diff --git a/.github/workflows/sanity_check.yml b/.github/workflows/sanity_check.yml index 3ee92fc85c2..27685115584 100644 --- a/.github/workflows/sanity_check.yml +++ b/.github/workflows/sanity_check.yml @@ -32,7 +32,7 @@ jobs: - name: Download clang-format-diff.py uses: wei/wget@v1 with: - args: https://raw.githubusercontent.com/llvm-mirror/clang/master/tools/clang-format/clang-format-diff.py + args: https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py - name: Check format run: VERBOSE_CHECK=1 make check-format diff --git a/.gitignore b/.gitignore index e55b306a5cb..e40ce2c2521 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ rocksdb.pc *.gcda *.gcno *.o +*.o.tmp *.so *.so.* *_test @@ -93,3 +94,8 @@ compile_commands.json .clangd clang-format-diff.py .py3/ + +fuzz/proto/gen/ +fuzz/crash-* + +cmake-build-* diff --git a/.travis.yml b/.travis.yml index 9a0c04afac7..8b4ed913c71 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,21 +3,21 @@ dist: trusty language: cpp os: - linux - - osx arch: - - amd64 - arm64 - ppc64le compiler: - gcc -osx_image: xcode9.4 cache: directories: - "$BUILD_DIR/aws" addons: - apt: - packages: - - zlib1g-dev + apt: + update: true + sources: + - ubuntu-toolchain-r-test + packages: + - libgflags-dev - libbz2-dev - libsnappy-dev - curl diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c5bf804ffe..ec149e9be0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,10 +32,11 @@ # 3. cmake .. # 4. make -j -cmake_minimum_required(VERSION 3.5.1) +cmake_minimum_required(VERSION 3.10) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") include(ReadVersion) +include(GoogleTest) get_rocksdb_version(rocksdb_VERSION) project(rocksdb VERSION ${rocksdb_VERSION} @@ -62,6 +63,7 @@ if(CCACHE_FOUND) endif(CCACHE_FOUND) option(WITH_JEMALLOC "build with JeMalloc" OFF) +option(WITH_LIBURING "build with liburing" ON) option(WITH_SNAPPY "build with SNAPPY" OFF) option(WITH_LZ4 "build with lz4" OFF) option(WITH_ZLIB "build with zlib" OFF) @@ -71,6 +73,12 @@ option(WITH_WINDOWS_UTF8_FILENAMES "use UTF8 as characterset for opening files, if (WITH_WINDOWS_UTF8_FILENAMES) add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) endif() + +if ($ENV{CIRCLECI}) + message(STATUS "Build for CircieCI env, a few tests may be disabled") + add_definitions(-DCIRCLECI) +endif() + # third-party/folly is only validated to work on Linux and Windows for now. # So only turn it on there by default. 
if(CMAKE_SYSTEM_NAME MATCHES "Linux|Windows") @@ -89,10 +97,9 @@ if( NOT DEFINED CMAKE_CXX_STANDARD ) endif() include(CMakeDependentOption) -CMAKE_DEPENDENT_OPTION(WITH_GFLAGS "build with GFlags" ON - "NOT MSVC;NOT MINGW" OFF) if(MSVC) + option(WITH_GFLAGS "build with GFlags" OFF) option(WITH_XPRESS "build with windows built in compression" OFF) include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) else() @@ -108,6 +115,11 @@ else() endif() endif() + if(MINGW) + option(WITH_GFLAGS "build with GFlags" OFF) + else() + option(WITH_GFLAGS "build with GFlags" ON) + endif() set(GFLAGS_LIB) if(WITH_GFLAGS) # Config with namespace available since gflags 2.2.2 @@ -119,11 +131,11 @@ else() set(GFLAGS_LIB ${GFLAGS_TARGET}) else() # Config with GFLAGS_LIBRARIES available since gflags 2.1.0 - set(GFLAGS_LIB ${GFLAGS_LIBRARIES}) + set(GFLAGS_LIB ${gflags_LIBRARIES}) endif() else() find_package(gflags REQUIRED) - set(GFLAGS_LIB gflags::gflags) + set(GFLAGS_LIB gflags::gflags) endif() include_directories(${GFLAGS_INCLUDE_DIR}) list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB}) @@ -178,23 +190,25 @@ if(WITH_AWS) list(APPEND THIRDPARTY_LIBS ${AWSSDK_LINK_LIBRARIES}) endif() -string(TIMESTAMP TS "%Y/%m/%d %H:%M:%S" UTC) -set(GIT_DATE_TIME "${TS}" CACHE STRING "the time we first built rocksdb") +string(TIMESTAMP TS "%Y-%m-%d %H:%M:%S" UTC) +set(BUILD_DATE "${TS}" CACHE STRING "the time we first built rocksdb") find_package(Git) if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") - if(WIN32) - execute_process(COMMAND $ENV{COMSPEC} /C ${GIT_EXECUTABLE} -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) - else() - execute_process(COMMAND ${GIT_EXECUTABLE} -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_SHA COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD ) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" RESULT_VARIABLE GIT_MOD COMMAND "${GIT_EXECUTABLE}" diff-index HEAD --quiet) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_DATE COMMAND "${GIT_EXECUTABLE}" log -1 --date=format:"%Y-%m-%d %T" --format="%ad") + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG RESULT_VARIABLE rv COMMAND "${GIT_EXECUTABLE}" symbolic-ref -q --short HEAD OUTPUT_STRIP_TRAILING_WHITESPACE) + if (rv AND NOT rv EQUAL 0) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match OUTPUT_STRIP_TRAILING_WHITESPACE) endif() else() set(GIT_SHA 0) + set(GIT_MOD 1) endif() - -string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") - +string(REGEX REPLACE "[^0-9a-fA-F]+" "" GIT_SHA "${GIT_SHA}") +string(REGEX REPLACE "[^0-9: /-]+" "" GIT_DATE "${GIT_DATE}") option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) @@ -207,15 +221,16 @@ endif() set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc) configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY) -add_library(build_version OBJECT ${BUILD_VERSION_CC}) -target_include_directories(build_version PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/util) + if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") + 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wstrict-prototypes") + endif() if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format -fno-asynchronous-unwind-tables") add_definitions(-D_POSIX_C_SOURCE=1) @@ -251,14 +266,14 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") endif(HAS_ALTIVEC) endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64") CHECK_C_COMPILER_FLAG("-march=armv8-a+crc+crypto" HAS_ARMV8_CRC) if(HAS_ARMV8_CRC) message(STATUS " HAS_ARMV8_CRC yes") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") endif(HAS_ARMV8_CRC) -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64") option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) @@ -297,6 +312,7 @@ else() endif() include(CheckCXXSourceCompiles) +set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) if(NOT MSVC) set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") endif() @@ -313,7 +329,6 @@ int main() { auto d = _mm_cvtsi128_si64(c); } " HAVE_SSE42) -unset(CMAKE_REQUIRED_FLAGS) if(HAVE_SSE42) add_definitions(-DHAVE_SSE42) add_definitions(-DHAVE_PCLMUL) @@ -321,18 +336,66 @@ elseif(FORCE_SSE42) message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") endif() +# Check if -latomic is required or not +if (NOT MSVC) + set(CMAKE_REQUIRED_FLAGS "--std=c++11") + CHECK_CXX_SOURCE_COMPILES(" +#include +std::atomic x(0); +int main() { + uint64_t i = x.load(std::memory_order_relaxed); + bool b = x.is_lock_free(); + return 0; +} +" BUILTIN_ATOMIC) +if (NOT BUILTIN_ATOMIC) + #TODO: Check if -latomic exists + list(APPEND THIRDPARTY_LIBS atomic) +endif() +endif() + +if (WITH_LIBURING) + set(CMAKE_REQUIRED_FLAGS "-luring") + CHECK_CXX_SOURCE_COMPILES(" +#include +int main() { + struct io_uring ring; + io_uring_queue_init(1, &ring, 0); + return 0; +} +" HAS_LIBURING) + if (HAS_LIBURING) + add_definitions(-DROCKSDB_IOURING_PRESENT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -luring") + endif() +endif() + +# Reset the required flags +set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + CHECK_CXX_SOURCE_COMPILES(" #if defined(_MSC_VER) && !defined(__thread) #define __thread __declspec(thread) #endif int main() { static __thread int tls; + (void)tls; } " HAVE_THREAD_LOCAL) if(HAVE_THREAD_LOCAL) add_definitions(-DROCKSDB_SUPPORT_THREAD_LOCAL) endif() +option(WITH_IOSTATS_CONTEXT "Enable IO stats context" ON) +if (NOT WITH_IOSTATS_CONTEXT) + add_definitions(-DNIOSTATS_CONTEXT) +endif() + +option(WITH_PERF_CONTEXT "Enable perf context" ON) +if (NOT WITH_PERF_CONTEXT) + add_definitions(-DNPERF_CONTEXT) +endif() + option(FAIL_ON_WARNINGS "Treat compile warnings as errors" ON) if(FAIL_ON_WARNINGS) if(MSVC) @@ -474,11 +537,6 @@ if(CMAKE_SYSTEM_NAME MATCHES "Cygwin") add_definitions(-fno-builtin-memcmp -DCYGWIN) elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") add_definitions(-DOS_MACOSX) - if(CMAKE_SYSTEM_PROCESSOR MATCHES arm) - 
add_definitions(-DIOS_CROSS_COMPILE -DROCKSDB_LITE) - # no debug info for IOS, that will make our library big - add_definitions(-DNDEBUG) - endif() elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") add_definitions(-DOS_LINUX) elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") @@ -573,12 +631,15 @@ find_package(Threads REQUIRED) set(SOURCES cache/cache.cc + cache/cache_entry_roles.cc cache/clock_cache.cc cache/lru_cache.cc cache/sharded_cache.cc db/arena_wrapped_db_iter.cc + db/blob/blob_fetcher.cc db/blob/blob_file_addition.cc db/blob/blob_file_builder.cc + db/blob/blob_file_cache.cc db/blob/blob_file_garbage.cc db/blob/blob_file_meta.cc db/blob/blob_file_reader.cc @@ -588,7 +649,6 @@ set(SOURCES db/builder.cc db/c.cc db/column_family.cc - db/compacted_db_impl.cc db/compaction/compaction.cc db/compaction/compaction_iterator.cc db/compaction/compaction_picker.cc @@ -599,6 +659,7 @@ set(SOURCES db/compaction/sst_partitioner.cc db/convenience.cc db/db_filesnapshot.cc + db/db_impl/compacted_db_impl.cc db/db_impl/db_impl.cc db/db_impl/db_impl_write.cc db/db_impl/db_impl_compaction_flush.cc @@ -649,17 +710,20 @@ set(SOURCES db/write_batch_base.cc db/write_controller.cc db/write_thread.cc + env/composite_env.cc env/env.cc env/env_chroot.cc env/env_encryption.cc env/env_hdfs.cc env/file_system.cc env/file_system_tracer.cc + env/fs_remap.cc env/mock_env.cc file/delete_scheduler.cc file/file_prefetch_buffer.cc file/file_util.cc file/filename.cc + file/line_file_reader.cc file/random_access_file_reader.cc file/read_write_util.cc file/readahead_raf.cc @@ -694,6 +758,7 @@ set(SOURCES monitoring/thread_status_util_debug.cc options/cf_options.cc options/configurable.cc + options/customizable.cc options/db_options.cc options/options.cc options/options_helper.cc @@ -772,6 +837,7 @@ set(SOURCES util/murmurhash.cc util/random.cc util/rate_limiter.cc + util/ribbon_config.cc util/slice.cc util/file_checksum_helper.cc util/status.cc @@ -817,8 +883,11 @@ set(SOURCES utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc - utilities/transactions/lock/lock_tracker.cc - utilities/transactions/lock/point_lock_tracker.cc + utilities/transactions/lock/lock_manager.cc + utilities/transactions/lock/point/point_lock_tracker.cc + utilities/transactions/lock/point/point_lock_manager.cc + utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc + utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/optimistic_transaction.cc utilities/transactions/pessimistic_transaction.cc @@ -826,7 +895,6 @@ set(SOURCES utilities/transactions/snapshot_checker.cc utilities/transactions/transaction_base.cc utilities/transactions/transaction_db_mutex_impl.cc - utilities/transactions/transaction_lock_mgr.cc utilities/transactions/transaction_util.cc utilities/transactions/write_prepared_txn.cc utilities/transactions/write_prepared_txn_db.cc @@ -854,6 +922,20 @@ set(SOURCES db/db_impl/db_impl_remote_compaction.cc $) +list(APPEND SOURCES + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc + utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc + 
utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc + utilities/transactions/lock/range/range_tree/lib/util/dbt.cc + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc) + if(HAVE_SSE42 AND NOT MSVC) set_source_files_properties( util/crc32c.cc @@ -929,12 +1011,12 @@ else() set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) endif() -add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES}) +add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES} ${BUILD_VERSION_CC}) target_link_libraries(${ROCKSDB_STATIC_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(ROCKSDB_BUILD_SHARED) - add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES}) + add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES} ${BUILD_VERSION_CC}) target_link_libraries(${ROCKSDB_SHARED_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) @@ -952,7 +1034,7 @@ if(ROCKSDB_BUILD_SHARED) LINKER_LANGUAGE CXX VERSION ${rocksdb_VERSION} SOVERSION ${rocksdb_VERSION_MAJOR} - OUTPUT_NAME "rocksdb") + OUTPUT_NAME "rocksdb${ARTIFACT_SUFFIX}") endif() endif() @@ -1074,15 +1156,21 @@ if(WITH_TESTS) cloud/remote_compaction_test.cc db/blob/blob_file_addition_test.cc db/blob/blob_file_builder_test.cc + db/blob/blob_file_cache_test.cc db/blob/blob_file_garbage_test.cc db/blob/blob_file_reader_test.cc + db/blob/db_blob_basic_test.cc + db/blob/db_blob_compaction_test.cc + db/blob/db_blob_corruption_test.cc db/blob/db_blob_index_test.cc db/column_family_test.cc db/compact_files_test.cc + db/compaction/clipping_iterator_test.cc db/compaction/compaction_job_stats_test.cc db/compaction/compaction_job_test.cc db/compaction/compaction_iterator_test.cc db/compaction/compaction_picker_test.cc + db/compaction/compaction_service_test.cc db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc @@ -1098,6 +1186,7 @@ if(WITH_TESTS) db/db_iter_test.cc db/db_iter_stress_test.cc db/db_iterator_test.cc + db/db_kv_checksum_test.cc db/db_log_iter_test.cc db/db_memtable_test.cc db/db_merge_operator_test.cc @@ -1105,7 +1194,7 @@ if(WITH_TESTS) db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc - db/db_impl/db_secondary_test.cc + db/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc @@ -1169,6 +1258,7 @@ if(WITH_TESTS) monitoring/statistics_test.cc monitoring/stats_history_test.cc options/configurable_test.cc + options/customizable_test.cc options/options_settable_test.cc options/options_test.cc table/block_based/block_based_filter_block_test.cc @@ -1185,6 +1275,7 @@ if(WITH_TESTS) table/table_test.cc table/block_fetcher_test.cc test_util/testutil_test.cc + trace_replay/block_cache_tracer_test.cc trace_replay/io_tracer_test.cc tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc tools/io_tracer_parser_test.cc @@ -1205,6 +1296,7 @@ if(WITH_TESTS) util/random_test.cc util/rate_limiter_test.cc util/repeatable_thread_test.cc + util/ribbon_test.cc util/slice_test.cc util/slice_transform_test.cc util/timer_queue_test.cc @@ -1231,9 +1323,10 @@ if(WITH_TESTS) utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc utilities/transactions/transaction_test.cc - utilities/transactions/transaction_lock_mgr_test.cc + 
utilities/transactions/lock/point/point_lock_manager_test.cc utilities/transactions/write_prepared_transaction_test.cc utilities/transactions/write_unprepared_transaction_test.cc + utilities/transactions/lock/range/range_locking_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc ) @@ -1268,21 +1361,21 @@ if(WITH_TESTS) foreach(sourcefile ${TESTS}) get_filename_component(exename ${sourcefile} NAME_WE) - add_executable(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} ${sourcefile}) - set_target_properties(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} + add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile}) + set_target_properties(${exename}${ARTIFACT_SUFFIX} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 OUTPUT_NAME ${exename}${ARTIFACT_SUFFIX} ) - target_link_libraries(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) if(NOT "${exename}" MATCHES "db_sanity_test") - add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) - add_dependencies(check ${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX}) + gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120) + add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) endif() if("${exename}" MATCHES "env_librados_test") # env_librados_test.cc uses librados directly - target_link_libraries(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} rados) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} rados) endif() endforeach(sourcefile ${TESTS}) @@ -1307,40 +1400,42 @@ if(WITH_TESTS) endif() if(WITH_BENCHMARK_TOOLS) - add_executable(db_bench + add_executable(db_bench${ARTIFACT_SUFFIX} + tools/simulated_hybrid_file_system.cc tools/db_bench.cc tools/db_bench_tool.cc) - target_link_libraries(db_bench + target_link_libraries(db_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) - add_executable(cache_bench - cache/cache_bench.cc) - target_link_libraries(cache_bench + add_executable(cache_bench${ARTIFACT_SUFFIX} + cache/cache_bench.cc + cache/cache_bench_tool.cc) + target_link_libraries(cache_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(memtablerep_bench + add_executable(memtablerep_bench${ARTIFACT_SUFFIX} memtable/memtablerep_bench.cc) - target_link_libraries(memtablerep_bench + target_link_libraries(memtablerep_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(range_del_aggregator_bench + add_executable(range_del_aggregator_bench${ARTIFACT_SUFFIX} db/range_del_aggregator_bench.cc) - target_link_libraries(range_del_aggregator_bench + target_link_libraries(range_del_aggregator_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(table_reader_bench + add_executable(table_reader_bench${ARTIFACT_SUFFIX} table/table_reader_bench.cc) - target_link_libraries(table_reader_bench + target_link_libraries(table_reader_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} testharness ${GFLAGS_LIB}) - add_executable(filter_bench + add_executable(filter_bench${ARTIFACT_SUFFIX} util/filter_bench.cc) - target_link_libraries(filter_bench + target_link_libraries(filter_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(hash_table_bench + add_executable(hash_table_bench${ARTIFACT_SUFFIX} utilities/persistent_cache/hash_table_bench.cc) - 
target_link_libraries(hash_table_bench + target_link_libraries(hash_table_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) endif() diff --git a/HISTORY.md b/HISTORY.md index a4a66317dc7..837bb1b73e4 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,34 +1,223 @@ # Rocksdb Change Log -## 6.14.6 (12/01/2020) +## 6.22.1 (2021-06-25) ### Bug Fixes -* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries. +* `GetLiveFilesMetaData()` now populates the `temperature`, `oldest_ancester_time`, and `file_creation_time` fields of its `LiveFileMetaData` results when the information is available. Previously these fields always contained zero indicating unknown. + +## 6.22.0 (2021-06-18) +### Behavior Changes +* Added two additional tickers, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH. These stats can be used to estimate the ratio of "garbage" (outdated) bytes in the memtable that are discarded at flush time. +* Added API comments clarifying safe usage of Disable/EnableManualCompaction and EventListener callbacks for compaction. -## 6.14.5 (11/15/2020) ### Bug Fixes -* Fix a bug of encoding and parsing BlockBasedTableOptions::read_amp_bytes_per_bit as a 64-bit integer. -* Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. Without this fix, original code introduced in PR7659, when running on big-endian machine, can mistakenly store read_amp_bytes_per_bit (an uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected. +* fs_posix.cc GetFreeSpace() always report disk space available to root even when running as non-root. Linux defaults often have disk mounts with 5 to 10 percent of total space reserved only for root. Out of space could result for non-root users. +* Subcompactions are now disabled when user-defined timestamps are used, since the subcompaction boundary picking logic is currently not timestamp-aware, which could lead to incorrect results when different subcompactions process keys that only differ by timestamp. +* Fix an issue that `DeleteFilesInRange()` may cause ongoing compaction reports corruption exception, or ASSERT for debug build. There's no actual data loss or corruption that we find. +* Fixed confusingly duplicated output in LOG for periodic stats ("DUMPING STATS"), including "Compaction Stats" and "File Read Latency Histogram By Level". +* Fixed performance bugs in background gathering of block cache entry statistics, that could consume a lot of CPU when there are many column families with a shared block cache. + +### New Features +* Marked the Ribbon filter and optimize_filters_for_memory features as production-ready, each enabling memory savings for Bloom-like filters. Use `NewRibbonFilterPolicy` in place of `NewBloomFilterPolicy` to use Ribbon filters instead of Bloom, or `ribbonfilter` in place of `bloomfilter` in configuration string. +* Allow `DBWithTTL` to use `DeleteRange` api just like other DBs. `DeleteRangeCF()` which executes `WriteBatchInternal::DeleteRange()` has been added to the handler in `DBWithTTLImpl::Write()` to implement it. +* Add BlockBasedTableOptions.prepopulate_block_cache. If enabled, it prepopulate warm/hot data blocks which are already in memory into block cache at the time of flush. 
On a flush, the data block that is in memory (in memtables) gets flushed to the device. If using Direct IO, additional IO is incurred to read this data back into memory again, which is avoided by enabling this option; it also helps with distributed file systems. More details in include/rocksdb/table.h. +* Added a `cancel` field to `CompactRangeOptions`, allowing individual in-process manual range compactions to be cancelled. -## 6.14.4 (11/05/2020) +## 6.21.0 (2021-05-21) ### Bug Fixes -Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before `TableBuilder::Finish()` in compaction job. For example, the `NeedCompact()` method of `CompactOnDeletionCollector` returned by built-in `CompactOnDeletionCollectorFactory` requires `BlockBasedTable::Finish()` to return the correct result. The bug can cause a compaction-generated file not to be marked for future compaction based on deletion ratio. +* Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but the client returns an error. The bug can cause the CURRENT file to point to a non-existent MANIFEST file, so the DB cannot be opened. +* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results. +* Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`. +* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted. +* Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed. +* Handle return code by io_uring_submit_and_wait() and io_uring_wait_cqe(). +* In the IngestExternalFile() API, only try to sync the ingested file if the file is linked and the FileSystem/Env supports reopening a writable file. +* Fixed a bug that `AdvancedColumnFamilyOptions.max_compaction_bytes` is under-calculated for manual compaction (`CompactRange()`). Manual compaction is split to multiple compactions if the compaction size exceeds the `max_compaction_bytes`. The bug creates a much larger compaction whose size exceeds the user setting. On the other hand, larger manual compaction size can increase the subcompaction parallelism; you can tune that by setting `max_compaction_bytes`. + +### Behavior Changes +* Due to the fix of false-positive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: a CF with some delete operations in the last few queries, which result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF).
The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status. + +### New Features +* Add new option allow_stall passed during instance creation of WriteBufferManager. When allow_stall is set, WriteBufferManager will stall all writers shared across multiple DBs and columns if memory usage goes beyond specified WriteBufferManager::buffer_size (soft limit). Stall will be cleared when memory is freed after flush and memory usage goes down below buffer_size. +* Allow `CompactionFilter`s to apply in more table file creation scenarios such as flush and recovery. For compatibility, `CompactionFilter`s by default apply during compaction. Users can customize this behavior by overriding `CompactionFilterFactory::ShouldFilterTableFileCreation()`. +* Added more fields to FilterBuildingContext with LSM details, for custom filter policies that vary behavior based on where they are in the LSM-tree. +* Added DB::Properties::kBlockCacheEntryStats for querying statistics on what percentage of block cache is used by various kinds of blocks, etc. using DB::GetProperty and DB::GetMapProperty. The same information is now dumped to info LOG periodically according to `stats_dump_period_sec`. +* Add an experimental Remote Compaction feature, which allows the user to run Compaction on a different host or process. The feature is still under development, currently only works on some basic use cases. The interface will be changed without backward/forward compatibility support. +* RocksDB would validate total entries read in flush, and compare with counter inserted into it. If flush_verify_memtable_count = true (default), flush will fail. Otherwise, only log to info logs. +* Add `TableProperties::num_filter_entries`, which can be used with `TableProperties::filter_size` to calculate the effective bits per filter entry (unique user key or prefix) for a table file. + +### Performance Improvements +* BlockPrefetcher is used by iterators to prefetch data if they anticipate more data to be used in future. It is enabled implicitly by rocksdb. Added change to take in account read pattern if reads are sequential. This would disable prefetching for random reads in MultiGet and iterators as readahead_size is increased exponential doing large prefetches. + +### Public API change +* Removed a parameter from TableFactory::NewTableBuilder, which should not be called by user code because TableBuilder is not a public API. +* Removed unused structure `CompactionFilterContext`. +* The `skip_filters` parameter to SstFileWriter is now considered deprecated. Use `BlockBasedTableOptions::filter_policy` to control generation of filters. +* ClockCache is known to have bugs that could lead to crash or corruption, so should not be used until fixed. Use NewLRUCache instead. +* Added a new pure virtual function `ApplyToAllEntries` to `Cache`, to replace `ApplyToAllCacheEntries`. Custom `Cache` implementations must add an implementation. Because this function is for gathering statistics, an empty implementation could be acceptable for some applications. +* Added the ObjectRegistry to the ConfigOptions class. This registry instance will be used to find any customizable loadable objects during initialization. +* Expanded the ObjectRegistry functionality to allow nested ObjectRegistry instances. 
Added methods to register a set of functions with the registry/library as a group. +* Deprecated backupable_db.h and BackupableDBOptions in favor of new versions with appropriate names: backup_engine.h and BackupEngineOptions. Old API compatibility is preserved. + +### Default Option Change +* When options.arena_block_size <= 0 (default value 0), still use writer_buffer_size / 8 but cap to 1MB. Too large alloation size might not be friendly to allocator and might cause performance issues in extreme cases. + +### Build +* By default, try to build with liburing. For make, if ROCKSDB_USE_IO_URING is not set, treat as enable, which means RocksDB will try to build with liburing. Users can disable it with ROCKSDB_USE_IO_URING=0. For cmake, add WITH_LIBURING to control it, with default on. + +## 6.20.0 (2021-04-16) +### Behavior Changes +* `ColumnFamilyOptions::sample_for_compression` now takes effect for creation of all block-based tables. Previously it only took effect for block-based tables created by flush. +* `CompactFiles()` can no longer compact files from lower level to up level, which has the risk to corrupt DB (details: #8063). The validation is also added to all compactions. +* Fixed some cases in which DB::OpenForReadOnly() could write to the filesystem. If you want a Logger with a read-only DB, you must now set DBOptions::info_log yourself, such as using CreateLoggerFromOptions(). +* get_iostats_context() will never return nullptr. If thread-local support is not available, and user does not opt-out iostats context, then compilation will fail. The same applies to perf context as well. +* Added support for WriteBatchWithIndex::NewIteratorWithBase when overwrite_key=false. Previously, this combination was not supported and would assert or return nullptr. +* Improve the behavior of WriteBatchWithIndex for Merge operations. Now more operations may be stored in order to return the correct merged result. -## 6.14.3 (10/30/2020) ### Bug Fixes -* Reverted a behavior change silently introduced in 6.14.2, in which the effects of the `ignore_unknown_options` flag (used in option parsing/loading functions) changed. -* Reverted a behavior change silently introduced in 6.14, in which options parsing/loading functions began returning `NotFound` instead of `InvalidArgument` for option names not available in the present version. +* Use thread-safe `strerror_r()` to get error messages. +* Fixed a potential hang in shutdown for a DB whose `Env` has high-pri thread pool disabled (`Env::GetBackgroundThreads(Env::Priority::HIGH) == 0`) +* Made BackupEngine thread-safe and added documentation comments to clarify what is safe for multiple BackupEngine objects accessing the same backup directory. +* Fixed crash (divide by zero) when compression dictionary is applied to a file containing only range tombstones. +* Fixed a backward iteration bug with partitioned filter enabled: not including the prefix of the last key of the previous filter partition in current filter partition can cause wrong iteration result. +* Fixed a bug that allowed `DBOptions::max_open_files` to be set with a non-negative integer with `ColumnFamilyOptions::compaction_style = kCompactionStyleFIFO`. + +### Performance Improvements +* On ARM platform, use `yield` instead of `wfe` to relax cpu to gain better performance. + +### Public API change +* Added `TableProperties::slow_compression_estimated_data_size` and `TableProperties::fast_compression_estimated_data_size`. 
When `ColumnFamilyOptions::sample_for_compression > 0`, they estimate what `TableProperties::data_size` would have been if the "fast" or "slow" (see `ColumnFamilyOptions::sample_for_compression` API doc for definitions) compression had been used instead. +* Update DB::StartIOTrace and remove Env object from the arguments as its redundant and DB already has Env object that is passed down to IOTracer::StartIOTrace +* Added `FlushReason::kWalFull`, which is reported when a memtable is flushed due to the WAL reaching its size limit; those flushes were previously reported as `FlushReason::kWriteBufferManager`. Also, changed the reason for flushes triggered by the write buffer manager to `FlushReason::kWriteBufferManager`; they were previously reported as `FlushReason::kWriteBufferFull`. +* Extend file_checksum_dump ldb command and DB::GetLiveFilesChecksumInfo API for IntegratedBlobDB and get checksum of blob files along with SST files. -## 6.14.2 (10/21/2020) +### New Features +* Added the ability to open BackupEngine backups as read-only DBs, using BackupInfo::name_for_open and env_for_open provided by BackupEngine::GetBackupInfo() with include_file_details=true. +* Added BackupEngine support for integrated BlobDB, with blob files shared between backups when table files are shared. Because of current limitations, blob files always use the kLegacyCrc32cAndFileSize naming scheme, and incremental backups must read and checksum all blob files in a DB, even for files that are already backed up. +* Added an optional output parameter to BackupEngine::CreateNewBackup(WithMetadata) to return the BackupID of the new backup. +* Added BackupEngine::GetBackupInfo / GetLatestBackupInfo for querying individual backups. +* Made the Ribbon filter a long-term supported feature in terms of the SST schema(compatible with version >= 6.15.0) though the API for enabling it is expected to change. + +## 6.19.0 (2021-03-21) ### Bug Fixes -* Fixed a bug which causes hang in closing DB when refit level is set in opt build. It was because ContinueBackgroundWork() was called in assert statement which is a no op. It was introduced in 6.14. +* Fixed the truncation error found in APIs/tools when dumping block-based SST files in a human-readable format. After fix, the block-based table can be fully dumped as a readable file. +* When hitting a write slowdown condition, no write delay (previously 1 millisecond) is imposed until `delayed_write_rate` is actually exceeded, with an initial burst allowance of 1 millisecond worth of bytes. Also, beyond the initial burst allowance, `delayed_write_rate` is now more strictly enforced, especially with multiple column families. + +### Public API change +* Changed default `BackupableDBOptions::share_files_with_checksum` to `true` and deprecated `false` because of potential for data loss. Note that accepting this change in behavior can temporarily increase backup data usage because files are not shared between backups using the two different settings. Also removed obsolete option kFlagMatchInterimNaming. +* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read upto max_auto_readahead_size and now max_auto_readahead_size can be configured dynamically as well. Found that 256 KB readahead size provides the best performance, based on experiments, for auto readahead. 
Experiment data is in PR #3282. If the value is set to 0, then no automatic prefetching will be done by rocksdb. Also changing the value will only affect files opened after the change. +* Add support to extend DB::VerifyFileChecksums API to also verify blob file checksums. +* When using the new BlobDB, the amount of data written by flushes/compactions is now broken down into table files and blob files in the compaction statistics; namely, Write(GB) denotes the amount of data written to table files, while Wblob(GB) means the amount of data written to blob files. +* New default BlockBasedTableOptions::format_version=5 to enable new Bloom filter implementation by default, compatible with RocksDB versions >= 6.6.0. +* Add new SetBufferSize API to WriteBufferManager to allow dynamic management of memory allotted to all write buffers. This allows user code to adjust memory monitoring provided by WriteBufferManager as process memory needs change and datasets grow and shrink. +* Clarified the required semantics of Read() functions in FileSystem and Env APIs. Please ensure any custom implementations are compliant. +* For the new integrated BlobDB implementation, compaction statistics now include the amount of data read from blob files during compaction (due to garbage collection or compaction filters). Write amplification metrics have also been extended to account for data read from blob files. +* Add EqualWithoutTimestamp() to Comparator. +* Extend support to track blob files in SSTFileManager whenever a blob file is created/deleted. Blob files will be scheduled for deletion via SSTFileManager, and SSTFileManager will now take blob files into account while calculating size and space limits along with SST files. +* Add new Append and PositionedAppend API with checksum handoff to legacy Env. + +### New Features +* Support compaction filters for the new implementation of BlobDB. Add `FilterBlobByKey()` to `CompactionFilter`. Subclasses can override this method so that compaction filters can determine whether the actual blob value has to be read during compaction. Use a new `kUndetermined` in `CompactionFilter::Decision` to indicate that further action is necessary for the compaction filter to make a decision. +* Add support to extend retrieval of checksums for blob files from the MANIFEST when checkpointing. During backup, rocksdb can detect corruption in blob files during file copies. +* Add new options for db_bench --benchmarks: flush, waitforcompaction, compact0, compact1. +* Add an option to BackupEngine::GetBackupInfo to include the name and size of each backed-up file. Especially in the presence of file sharing among backups, this offers detailed insight into backup space usage. +* Enable backward iteration on keys with user-defined timestamps. +* Add statistics and info log for error handler: counters for bg error, bg io error, bg retryable io error, auto resume count, auto resume total retry number, and auto resume success; Histogram for auto resume retry count in each recovery call. Note that each auto resume attempt will have one or multiple retries. + +### Behavior Changes +* During flush, only WAL sync retryable IO error is mapped to hard error, which will stall the writes. When WAL is used but only SST file write has retryable IO error, it will be mapped to soft error and write will not be affected. + +## 6.18.0 (2021-02-19) ### Behavior Changes +* When retryable IO error occurs during compaction, it is mapped to a soft error and the BG error is set.
However, auto resume is not called to clean the soft error since compaction will reschedule by itself. In this change, When retryable IO error occurs during compaction, BG error is not set. User will be informed the error via EventHelper. +* Introduce a new trace file format for query tracing and replay and trace file version is bump up to 0.2. A payload map is added as the first portion of the payload. We will not have backward compatible issues when adding new entries to trace records. Added the iterator_upper_bound and iterator_lower_bound in Seek and SeekForPrev tracing function. Added them as the new payload member for iterator tracing. + +### New Features +* Add support for key-value integrity protection in live updates from the user buffers provided to `WriteBatch` through the write to RocksDB's in-memory update buffer (memtable). This is intended to detect some cases of in-memory data corruption, due to either software or hardware errors. Users can enable protection by constructing their `WriteBatch` with `protection_bytes_per_key == 8`. +* Add support for updating `full_history_ts_low` option in manual compaction, which is for old timestamp data GC. +* Add a mechanism for using Makefile to build external plugin code into the RocksDB libraries/binaries. This intends to simplify compatibility and distribution for plugins (e.g., special-purpose `FileSystem`s) whose source code resides outside the RocksDB repo. See "plugin/README.md" for developer details, and "PLUGINS.md" for a listing of available plugins. +* Added memory pre-fetching for experimental Ribbon filter, which especially optimizes performance with batched MultiGet. +* A new, experimental version of BlobDB (key-value separation) is now available. The new implementation is integrated into the RocksDB core, i.e. it is accessible via the usual `rocksdb::DB` API, as opposed to the separate `rocksdb::blob_db::BlobDB` interface used by the earlier version, and can be configured on a per-column family basis using the configuration options `enable_blob_files`, `min_blob_size`, `blob_file_size`, `blob_compression_type`, `enable_blob_garbage_collection`, and `blob_garbage_collection_age_cutoff`. It extends RocksDB's consistency guarantees to blobs, and offers more features and better performance. Note that some features, most notably `Merge`, compaction filters, and backup/restore are not yet supported, and there is no support for migrating a database created by the old implementation. -## 6.14.1 (10/13/2020) ### Bug Fixes -* Since 6.12, memtable lookup should report unrecognized value_type as corruption (#7121). -* Since 6.14, fix false positive flush/compaction `Status::Corruption` failure when `paranoid_file_checks == true` and range tombstones were written to the compaction output files. +* Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details. +* `OptimisticTransactionDB` now returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. 
+* Fix `WRITE_PREPARED`, `WRITE_UNPREPARED` TransactionDB `MultiGet()` may return uncommitted data with snapshot. +* In DB::OpenForReadOnly, if any error happens while checking Manifest file path, it was overridden by Status::NotFound. It has been fixed and now actual error is returned. + +### Public API Change +* Added a "only_mutable_options" flag to the ConfigOptions. When this flag is "true", the Configurable functions and convenience methods (such as GetDBOptionsFromString) will only deal with options that are marked as mutable. When this flag is true, only options marked as mutable can be configured (a Status::InvalidArgument will be returned) and options not marked as mutable will not be returned or compared. The default is "false", meaning to compare all options. +* Add new Append and PositionedAppend APIs to FileSystem to bring the data verification information (data checksum information) from upper layer (e.g., WritableFileWriter) to the storage layer. In this way, the customized FileSystem is able to verify the correctness of data being written to the storage on time. Add checksum_handoff_file_types to DBOptions. User can use this option to control which file types (Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile.) should use the new Append and PositionedAppend APIs to handoff the verification information. Currently, RocksDB only use crc32c to calculate the checksum for write handoff. +* Add an option, `CompressionOptions::max_dict_buffer_bytes`, to limit the in-memory buffering for selecting samples for generating/training a dictionary. The limit is currently loosely adhered to. + + +## 6.17.0 (2021-01-15) +### Behavior Changes +* When verifying full file checksum with `DB::VerifyFileChecksums()`, we now fail with `Status::InvalidArgument` if the name of the checksum generator used for verification does not match the name of the checksum generator used for protecting the file when it was created. +* Since RocksDB does not continue write the same file if a file write fails for any reason, the file scope write IO error is treated the same as retryable IO error. More information about error handling of file scope IO error is included in `ErrorHandler::SetBGError`. + +### Bug Fixes +* Version older than 6.15 cannot decode VersionEdits `WalAddition` and `WalDeletion`, fixed this by changing the encoded format of them to be ignorable by older versions. +* Fix a race condition between DB startups and shutdowns in managing the periodic background worker threads. One effect of this race condition could be the process being terminated. + +### Public API Change +* Add a public API WriteBufferManager::dummy_entries_in_cache_usage() which reports the size of dummy entries stored in cache (passed to WriteBufferManager). Dummy entries are used to account for DataBlocks. +* Add a SystemClock class that contains the time-related methods from Env. The original methods in Env may be deprecated in a future release. This class will allow easier testing, development, and expansion of time-related features. +* Add a public API GetRocksBuildProperties and GetRocksBuildInfoAsString to get properties about the current build. These properties may include settings related to the GIT settings (branch, timestamp). This change also sets the "build date" based on the GIT properties, rather than the actual build time, thereby enabling more reproducible builds. 
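As a rough illustration of the build-property APIs mentioned in the 6.17.0 entry above, here is a minimal sketch; it assumes the functions are declared in `rocksdb/version.h` and that the property map is iterable as key/value strings.

```cpp
// Minimal sketch (not part of the patch): querying the build-property APIs from
// the 6.17.0 entry above. Assumes they are declared in rocksdb/version.h.
#include <iostream>

#include "rocksdb/version.h"

int main() {
  // One-line, human-readable summary for a program banner or LOG header.
  std::cout << ROCKSDB_NAMESPACE::GetRocksBuildInfoAsString("my_tool") << "\n";

  // Individual properties (e.g. git sha, git tag, build date) as key/value pairs.
  for (const auto& prop : ROCKSDB_NAMESPACE::GetRocksBuildProperties()) {
    std::cout << prop.first << " = " << prop.second << "\n";
  }
  return 0;
}
```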
+ +## 6.16.0 (2020-12-18) +### Behavior Changes +* Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation. + +### Bug Fixes +* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`. +* Fix a bug where compressed blocks read by MultiGet are not inserted into the compressed block cache when use_direct_reads = true. +* Fixed the issue of full scanning on obsolete files when there are too many outstanding compactions with ConcurrentTaskLimiter enabled. +* Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. Without this fix, original code introduced in PR7659, when running on big-endian machine, can mistakenly store read_amp_bytes_per_bit (an uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected. +* Fixed prefix extractor with timestamp issues. +* Fixed a bug in atomic flush: in two-phase commit mode, the minimum WAL log number to keep is incorrect. +* Fixed a bug related to checkpoint in PR7789: if there are multiple column families, and the checkpoint is not opened as read only, then in rare cases, data loss may happen in the checkpoint. Since backup engine relies on checkpoint, it may also be affected. +* When ldb --try_load_options is used with the --column_family option, the ColumnFamilyOptions for the specified column family was not loaded from the OPTIONS file. Fix it so its loaded from OPTIONS and then overridden with command line overrides. + +### New Features +* User defined timestamp feature supports `CompactRange` and `GetApproximateSizes`. +* Support getting aggregated table properties (kAggregatedTableProperties and kAggregatedTablePropertiesAtLevel) with DB::GetMapProperty, for easier access to the data in a structured format. +* Experimental option BlockBasedTableOptions::optimize_filters_for_memory now works with experimental Ribbon filter (as well as Bloom filter). + +### Public API Change +* Deprecated public but rarely-used FilterBitsBuilder::CalculateNumEntry, which is replaced with ApproximateNumEntries taking a size_t parameter and returning size_t. +* To improve portability the functions `Env::GetChildren` and `Env::GetChildrenFileAttributes` will no longer return entries for the special directories `.` or `..`. +* Added a new option `track_and_verify_wals_in_manifest`. If `true`, the log numbers and sizes of the synced WALs are tracked in MANIFEST, then during DB recovery, if a synced WAL is missing from disk, or the WAL's size does not match the recorded size in MANIFEST, an error will be reported and the recovery will be aborted. Note that this option does not work with secondary instance. +* `rocksdb_approximate_sizes` and `rocksdb_approximate_sizes_cf` in the C API now requires an error pointer (`char** errptr`) for receiving any error. +* All overloads of DB::GetApproximateSizes now return Status, so that any failure to obtain the sizes is indicated to the caller. 
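A minimal sketch of checking the `Status` now returned by `DB::GetApproximateSizes` (per the 6.16.0 entry above); the key ranges used here are made up for illustration.

```cpp
// Minimal sketch (not part of the patch): handling the Status now returned by
// DB::GetApproximateSizes() per the 6.16.0 entry above. Key ranges are made up.
#include <cstdint>
#include <cstdio>

#include "rocksdb/db.h"

void PrintApproximateSizes(rocksdb::DB* db) {
  rocksdb::Range ranges[2] = {rocksdb::Range("a", "c"), rocksdb::Range("x", "z")};
  uint64_t sizes[2] = {0, 0};
  rocksdb::Status s =
      db->GetApproximateSizes(db->DefaultColumnFamily(), ranges, 2, sizes);
  if (!s.ok()) {
    // Before 6.16.0 this overload returned void, so failures were silent.
    std::fprintf(stderr, "GetApproximateSizes failed: %s\n", s.ToString().c_str());
    return;
  }
  std::printf("[a,c): %llu bytes, [x,z): %llu bytes\n",
              static_cast<unsigned long long>(sizes[0]),
              static_cast<unsigned long long>(sizes[1]));
}
```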
+ +## 6.15.0 (2020-11-13) +### Bug Fixes * Fixed a bug in the following combination of features: indexes with user keys (`format_version >= 3`), indexes are partitioned (`index_type == kTwoLevelIndexSearch`), and some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`). The bug could cause keys to be truncated when read from the index leading to wrong read results or other unexpected behavior. * Fixed a bug when indexes are partitioned (`index_type == kTwoLevelIndexSearch`), some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`), and partitions reads could be mixed between block cache and directly from the file (e.g., with `enable_index_compression == 1` and `mmap_read == 1`, partitions that were stored uncompressed due to poor compression ratio would be read directly from the file via mmap, while partitions that were stored compressed would be read from block cache). The bug could cause index partitions to be mistakenly considered empty during reads leading to wrong read results. +* Since 6.12, memtable lookup should report unrecognized value_type as corruption (#7121). +* Since 6.14, fix false positive flush/compaction `Status::Corruption` failure when `paranoid_file_checks == true` and range tombstones were written to the compaction output files. +* Since 6.14, fix a bug that could cause a stalled write to crash with mixed of slowdown and no_slowdown writes (`WriteOptions.no_slowdown=true`). +* Fixed a bug which causes hang in closing DB when refit level is set in opt build. It was because ContinueBackgroundWork() was called in assert statement which is a no op. It was introduced in 6.14. +* Fixed a bug which causes Get() to return incorrect result when a key's merge operand is applied twice. This can occur if the thread performing Get() runs concurrently with a background flush thread and another thread writing to the MANIFEST file (PR6069). +* Reverted a behavior change silently introduced in 6.14.2, in which the effects of the `ignore_unknown_options` flag (used in option parsing/loading functions) changed. +* Reverted a behavior change silently introduced in 6.14, in which options parsing/loading functions began returning `NotFound` instead of `InvalidArgument` for option names not available in the present version. +* Fixed MultiGet bugs it doesn't return valid data with user defined timestamp. +* Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before `TableBuilder::Finish()` in compaction job. For example, the `NeedCompact()` method of `CompactOnDeletionCollector` returned by built-in `CompactOnDeletionCollectorFactory` requires `BlockBasedTable::Finish()` to return the correct result. The bug can cause a compaction-generated file not to be marked for future compaction based on deletion ratio. +* Fixed a seek issue with prefix extractor and timestamp. +* Fixed a bug of encoding and parsing BlockBasedTableOptions::read_amp_bytes_per_bit as a 64-bit integer. +* Fixed a bug of a recovery corner case, details in PR7621. + +### Public API Change +* Deprecate `BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache` and `BlockBasedTableOptions::pin_top_level_index_and_filter`. These options still take effect until users migrate to the replacement APIs in `BlockBasedTableOptions::metadata_cache_options`. Migration guidance can be found in the API comments on the deprecated options. 
+* Add new API `DB::VerifyFileChecksums` to verify SST file checksum with corresponding entries in the MANIFEST if present. Current implementation requires scanning and recomputing file checksums. + +### Behavior Changes +* The dictionary compression settings specified in `ColumnFamilyOptions::compression_opts` now additionally affect files generated by flush and compaction to non-bottommost level. Previously those settings at most affected files generated by compaction to bottommost level, depending on whether `ColumnFamilyOptions::bottommost_compression_opts` overrode them. Users who relied on dictionary compression settings in `ColumnFamilyOptions::compression_opts` affecting only the bottommost level can keep the behavior by moving their dictionary settings to `ColumnFamilyOptions::bottommost_compression_opts` and setting its `enabled` flag. +* When the `enabled` flag is set in `ColumnFamilyOptions::bottommost_compression_opts`, those compression options now take effect regardless of the value in `ColumnFamilyOptions::bottommost_compression`. Previously, those compression options only took effect when `ColumnFamilyOptions::bottommost_compression != kDisableCompressionOption`. Now, they additionally take effect when `ColumnFamilyOptions::bottommost_compression == kDisableCompressionOption` (such a setting causes bottommost compression type to fall back to `ColumnFamilyOptions::compression_per_level` if configured, and otherwise fall back to `ColumnFamilyOptions::compression`). + +### New Features +* An EXPERIMENTAL new Bloom alternative that saves about 30% space compared to Bloom filters, with about 3-4x construction time and similar query times is available using NewExperimentalRibbonFilterPolicy. -## 6.14 (10/09/2020) +## 6.14 (2020-10-09) ### Bug fixes * Fixed a bug after a `CompactRange()` with `CompactRangeOptions::change_level` set fails due to a conflict in the level change step, which caused all subsequent calls to `CompactRange()` with `CompactRangeOptions::change_level` set to incorrectly fail with a `Status::NotSupported("another thread is refitting")` error. * Fixed a bug that the bottom most level compaction could still be a trivial move even if `BottommostLevelCompaction.kForce` or `kForceOptimized` is set. @@ -44,12 +233,15 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * The settings of the DBOptions and ColumnFamilyOptions are now managed by Configurable objects (see New Features). The same convenience methods to configure these options still exist but the backend implementation has been unified under a common implementation. ### New Features + * Methods to configure serialize, and compare -- such as TableFactory -- are exposed directly through the Configurable base class (from which these objects inherit). This change will allow for better and more thorough configuration management and retrieval in the future. The options for a Configurable object can be set via the ConfigureFromMap, ConfigureFromString, or ConfigureOption method. The serialized version of the options of an object can be retrieved via the GetOptionString, ToString, or GetOption methods. The list of options supported by an object can be obtained via the GetOptionNames method. The "raw" object (such as the BlockBasedTableOption) for an option may be retrieved via the GetOptions method. Configurable options can be compared via the AreEquivalent method. The settings within a Configurable object may be validated via the ValidateOptions method. 
The object may be intialized (at which point only mutable options may be updated) via the PrepareOptions method. * Introduce options.check_flush_compaction_key_order with default value to be true. With this option, during flush and compaction, key order will be checked when writing to each SST file. If the order is violated, the flush or compaction will fail. * Added is_full_compaction to CompactionJobStats, so that the information is available through the EventListener interface. * Add more stats for MultiGet in Histogram to get number of data blocks, index blocks, filter blocks and sst files read from file system per level. +* SST files have a new table property called db_host_id, which is set to the hostname by default. A new option in DBOptions, db_host_id, allows the property value to be overridden with a user specified string, or disable it completely by making the option string empty. +* Methods to create customizable extensions -- such as TableFactory -- are exposed directly through the Customizable base class (from which these objects inherit). This change will allow these Customizable classes to be loaded and configured in a standard way (via CreateFromString). More information on how to write and use Customizable classes is in the customizable.h header file. -## 6.13 (09/12/2020) +## 6.13 (2020-09-12) ### Bug fixes * Fix a performance regression introduced in 6.4 that makes a upper bound check for every Next() even if keys are within a data block that is within the upper bound. * Fix a possible corruption to the LSM state (overlapping files within a level) when a `CompactRange()` for refitting levels (`CompactRangeOptions::change_level == true`) and another manual compaction are executed in parallel. @@ -129,7 +321,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Reduce key comparisons during random access in all block-based tables. * BackupEngine avoids unnecessary repeated checksum computation for backing up a table file to the `shared_checksum` directory when using `share_files_with_checksum_naming = kUseDbSessionId` (new default), except on SST files generated before this version of RocksDB, which fall back on using `kLegacyCrc32cAndFileSize`. -## 6.11 (6/12/2020) +## 6.11 (2020-06-12) ### Bug Fixes * Fix consistency checking error swallowing in some cases when options.force_consistency_checks = true. * Fix possible false NotFound status from batched MultiGet using index type kHashSearch. @@ -164,7 +356,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before ### Performance Improvements * Eliminate redundant key comparisons during random access in block-based tables. -## 6.10 (5/2/2020) +## 6.10 (2020-05-02) ### Bug Fixes * Fix wrong result being read from ingested file. May happen when a key in the file happen to be prefix of another key also in the file. The issue can further cause more data corruption. The issue exists with rocksdb >= 5.0.0 since DB::IngestExternalFile() was introduced. * Finish implementation of BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It's now ready for use. Significantly reduces read amplification in some setups, especially for iterator seeks. @@ -196,7 +388,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Improve performance of batch MultiGet with partitioned filters, by sharing block cache lookups to applicable filter blocks. * Reduced memory copies when fetching and uncompressing compressed blocks from sst files. 
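As a rough illustration of the Configurable interface described in the 6.14 notes above, the sketch below configures a table factory from an option string and reads the serialized settings back. The `ConfigOptions`-first signatures of `ConfigureFromString` and `GetOptionString` are assumed from `include/rocksdb/configurable.h`, and the option string keys are only examples.

```cpp
#include <iostream>
#include <memory>
#include <string>

#include <rocksdb/convenience.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  std::shared_ptr<TableFactory> factory(NewBlockBasedTableFactory());

  ConfigOptions config_options;
  // Configure the factory from a serialized option string.
  Status s = factory->ConfigureFromString(
      config_options, "block_size=64K;cache_index_and_filter_blocks=true");
  if (!s.ok()) {
    std::cerr << "configure failed: " << s.ToString() << std::endl;
    return 1;
  }

  // Retrieve the serialized form of the current settings.
  std::string serialized;
  s = factory->GetOptionString(config_options, &serialized);
  if (s.ok()) std::cout << serialized << std::endl;
  return s.ok() ? 0 : 1;
}
```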
-## 6.9.0 (03/29/2020) +## 6.9.0 (2020-03-29) ### Behavior changes * Since RocksDB 6.8, ttl-based FIFO compaction can drop a file whose oldest key becomes older than options.ttl while others have not. This fix reverts this and makes ttl-based FIFO compaction use the file's flush time as the criterion. This fix also requires that max_open_files = -1 and compaction_options_fifo.allow_compaction = false to function properly. @@ -223,7 +415,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Added a new option, best_efforts_recovery (default: false), to allow database to open in a db dir with missing table files. During best efforts recovery, missing table files are ignored, and database recovers to the most recent state without missing table file. Cross-column-family consistency is not guaranteed even if WAL is enabled. * options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts are now dynamically changeable. -## 6.8.0 (02/24/2020) +## 6.8.0 (2020-02-24) ### Java API Changes * Major breaking changes to Java comparators, toward standardizing on ByteBuffer for performant, locale-neutral operations on keys (#6252). * Added overloads of common API methods using direct ByteBuffers for keys and values (#2283). @@ -250,21 +442,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * `db_bench` now supports `value_size_distribution_type`, `value_size_min`, `value_size_max` options for generating random variable sized value. Added `blob_db_compression_type` option for BlobDB to enable blob compression. * Replace RocksDB namespace "rocksdb" with flag "ROCKSDB_NAMESPACE" which if is not defined, defined as "rocksdb" in header file rocksdb_namespace.h. -## 6.7.3 (03/18/2020) -### Bug Fixes -* Fix a data race that might cause crash when calling DB::GetCreationTimeOfOldestFile() by a small chance. The bug was introduced in 6.6 Release. - -## 6.7.2 (02/24/2020) -### Bug Fixes -* Fixed a bug of IO Uring partial result handling introduced in 6.7.0. - - -## 6.7.1 (02/13/2020) -### Bug Fixes -* Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space. -* Batched MultiGet() ignores IO errors while reading data blocks, causing it to potentially continue looking for a key and returning stale results. - -## 6.7.0 (01/21/2020) +## 6.7.0 (2020-01-21) ### Public API Change * Added a rocksdb::FileSystem class in include/rocksdb/file_system.h to encapsulate file creation/read/write operations, and an option DBOptions::file_system to allow a user to pass in an instance of rocksdb::FileSystem. If its a non-null value, this will take precendence over DBOptions::env for file operations. A new API rocksdb::FileSystem::Default() returns a platform default object. The DBOptions::env option and Env::Default() API will continue to be used for threading and other OS related functions, and where DBOptions::file_system is not specified, for file operations. For storage developers who are accustomed to rocksdb::Env, the interface in rocksdb::FileSystem is new and will probably undergo some changes as more storage systems are ported to it from rocksdb::Env. As of now, no env other than Posix has been ported to the new interface. * A new rocksdb::NewSstFileManager() API that allows the caller to pass in separate Env and FileSystem objects. 
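For the best_efforts_recovery option mentioned above, a minimal sketch of opening a DB directory that may be missing table files follows; the path is illustrative, and as the changelog notes, cross-column-family consistency is not guaranteed in this mode.

```cpp
#include <rocksdb/db.h>
#include <rocksdb/options.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  // Best-efforts recovery (default: false): ignore missing table files and
  // recover to the most recent state that does not depend on them.
  options.best_efforts_recovery = true;

  DB* db = nullptr;
  Status s = DB::Open(options, "/tmp/rocksdb_partial_dir", &db);
  if (s.ok()) delete db;
  return s.ok() ? 0 : 1;
}
```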
@@ -289,11 +467,11 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Introduce ReadOptions.auto_prefix_mode. When set to true, iterator will return the same result as total order seek, but may choose to use prefix seek internally based on seek key and iterator upper bound. * MultiGet() can use IO Uring to parallelize read from the same SST file. This featuer is by default disabled. It can be enabled with environment variable ROCKSDB_USE_IO_URING. -## 6.6.2 (01/13/2020) +## 6.6.2 (2020-01-13) ### Bug Fixes * Fixed a bug where non-L0 compaction input files were not considered to compute the `creation_time` of new compaction outputs. -## 6.6.1 (01/02/2020) +## 6.6.1 (2020-01-02) ### Bug Fixes * Fix a bug in WriteBatchWithIndex::MultiGetFromBatchAndDB, which is called by Transaction::MultiGet, that causes due to stale pointer access when the number of keys is > 32 * Fixed two performance issues related to memtable history trimming. First, a new SuperVersion is now created only if some memtables were actually trimmed. Second, trimming is only scheduled if there is at least one flushed memtable that is kept in memory for the purposes of transaction conflict checking. @@ -303,7 +481,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Delete superversions in BackgroundCallPurge. * Fix use-after-free and double-deleting files in BackgroundCallPurge(). -## 6.6.0 (11/25/2019) +## 6.6.0 (2019-11-25) ### Bug Fixes * Fix data corruption caused by output of intra-L0 compaction on ingested file not being placed in correct order in L0. * Fix a data race between Version::GetColumnFamilyMetaData() and Compaction::MarkFilesBeingCompacted() for access to being_compacted (#6056). The current fix acquires the db mutex during Version::GetColumnFamilyMetaData(), which may cause regression. @@ -356,19 +534,19 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement. * Level iterator to invlidate the iterator more often in prefix seek and the level is filtered out by prefix bloom. -## 6.5.2 (11/15/2019) +## 6.5.2 (2019-11-15) ### Bug Fixes * Fix a assertion failure in MultiGet() when BlockBasedTableOptions::no_block_cache is true and there is no compressed block cache * Fix a buffer overrun problem in BlockBasedTable::MultiGet() when compression is enabled and no compressed block cache is configured. * If a call to BackupEngine::PurgeOldBackups or BackupEngine::DeleteBackup suffered a crash, power failure, or I/O error, files could be left over from old backups that could only be purged with a call to GarbageCollect. Any call to PurgeOldBackups, DeleteBackup, or GarbageCollect should now suffice to purge such files. -## 6.5.1 (10/16/2019) +## 6.5.1 (2019-10-16) ### Bug Fixes * Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound. * Fix a bug in BlockBasedTableIterator that might return incorrect results when reseek happens with a different iterator upper bound. * Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. 
::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. -## 6.5.0 (9/13/2019) +## 6.5.0 (2019-09-13) ### Bug Fixes * Fixed a number of data races in BlobDB. * Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0.. @@ -389,7 +567,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before ### Performance Improvements * Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance. -## 6.4.0 (7/30/2019) +## 6.4.0 (2019-07-30) ### Default Option Change * LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explicitly created, the small block cache created by BlockBasedTable will still has this option to be 0.0. * Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. @@ -425,7 +603,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fixed a regression where the fill_cache read option also affected index blocks. * Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes/filters as well. -## 6.3.2 (8/15/2019) +## 6.3.2 (2019-08-15) ### Public API Change * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. @@ -433,11 +611,11 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fixed a regression where the fill_cache read option also affected index blocks. * Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes as well. -## 6.3.1 (7/24/2019) +## 6.3.1 (2019-07-24) ### Bug Fixes * Fix auto rolling bug introduced in 6.3.0, which causes segfault if log file creation fails. -## 6.3.0 (6/18/2019) +## 6.3.0 (2019-06-18) ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. * Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers. @@ -474,7 +652,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. * On DB open, delete WAL trash files left behind in wal_dir -## 6.2.0 (4/30/2019) +## 6.2.0 (2019-04-30) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. @@ -496,7 +674,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Close a WAL file before another thread deletes it. 
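To illustrate the 6.4.0 default changes above (midpoint insertion via `high_pri_pool_ratio` and high-priority caching of index/filter blocks), here is a small sketch that sets the same values explicitly; capacity and shard count are illustrative.

```cpp
#include <rocksdb/cache.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  // Reserve half of the LRU cache as a high-priority pool (the 6.4.0 default).
  LRUCacheOptions cache_opts;
  cache_opts.capacity = 1 << 30;  // 1 GiB, illustrative
  cache_opts.num_shard_bits = 6;
  cache_opts.high_pri_pool_ratio = 0.5;

  BlockBasedTableOptions table_options;
  table_options.block_cache = NewLRUCache(cache_opts);
  table_options.cache_index_and_filter_blocks = true;
  table_options.cache_index_and_filter_blocks_with_high_priority = true;

  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  return 0;
}
```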
* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag. -## 6.1.1 (4/9/2019) +## 6.1.1 (2019-04-09) ### New Features * When reading from option file/string/map, customized comparators and/or merge operators can be filled according to object registry. @@ -506,7 +684,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction. * Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries. -## 6.1.0 (3/27/2019) +## 6.1.0 (2019-03-27) ### New Features * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. * Added a feature to perform data-block sampling for compressibility, and report stats to user. @@ -524,7 +702,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms. * Fix SstFileReader not able to open file ingested with write_glbal_seqno=true. -## 6.0.0 (2/19/2019) +## 6.0.0 (2019-02-19) ### New Features * Enabled checkpoint on readonly db (DBImplReadOnly). * Make DB ignore dropped column families while committing results of atomic flush. @@ -567,7 +745,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before ### Change Default Options * Change options.compaction_pri's default to kMinOverlappingRatio -## 5.18.0 (11/30/2018) +## 5.18.0 (2018-11-30) ### New Features * Introduced `JemallocNodumpAllocator` memory allocator. When being use, block cache will be excluded from core dump. * Introduced `PerfContextByLevel` as part of `PerfContext` which allows storing perf context at each level. Also replaced `__thread` with `thread_local` keyword for perf_context. Added per-level perf context for bloom filter and `Get` query. @@ -598,7 +776,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls. * Make DB ignore dropped column families while committing results of atomic flush. -## 5.17.0 (10/05/2018) +## 5.17.0 (2018-10-05) ### Public API Change * `OnTableFileCreated` will now be called for empty files generated during compaction. In that case, `TableFileCreationInfo::file_path` will be "(nil)" and `TableFileCreationInfo::file_size` will be zero. * Add `FlushOptions::allow_write_stall`, which controls whether Flush calls start working immediately, even if it causes user writes to stall, or will wait until flush can be performed without causing write stall (similar to `CompactRangeOptions::allow_write_stall`). Note that the default value is false, meaning we add delay to Flush calls until stalling can be avoided when possible. This is behavior change compared to previous RocksDB versions, where Flush calls didn't check if they might cause stall or not. @@ -612,21 +790,21 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. * Sync CURRENT file contents during checkpoint. 
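A minimal sketch of the `FlushOptions::allow_write_stall` behavior described under 5.17.0 above: by default (false) a Flush call waits until it can run without stalling user writes; setting it to true makes the flush start immediately.

```cpp
#include <rocksdb/db.h>
#include <rocksdb/options.h>

using namespace ROCKSDB_NAMESPACE;

// Trigger a flush right away, accepting a possible write stall.
Status FlushNow(DB* db) {
  FlushOptions flush_options;
  flush_options.allow_write_stall = true;  // do not wait for a stall-free window
  return db->Flush(flush_options);
}
```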
-## 5.16.3 (10/1/2018) +## 5.16.3 (2018-10-01) ### Bug Fixes * Fix crash caused when `CompactFiles` run with `CompactionOptions::compression == CompressionType::kDisableCompressionOption`. Now that setting causes the compression type to be chosen according to the column family-wide compression options. -## 5.16.2 (9/21/2018) +## 5.16.2 (2018-09-21) ### Bug Fixes * Fix bug in partition filters with format_version=4. -## 5.16.1 (9/17/2018) +## 5.16.1 (2018-09-17) ### Bug Fixes * Remove trace_analyzer_tool from rocksdb_lib target in TARGETS file. * Fix RocksDB Java build and tests. * Remove sync point in Block destructor. -## 5.16.0 (8/21/2018) +## 5.16.0 (2018-08-21) ### Public API Change * The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons * GetAllKeyVersions() to take an extra argument of `max_num_ikeys`. @@ -640,7 +818,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before ### Bug Fixes * Fix a bug in misreporting the estimated partition index size in properties block. -## 5.15.0 (7/17/2018) +## 5.15.0 (2018-07-17) ### Public API Change * Remove managed iterator. ReadOptions.managed is not effective anymore. * For bottommost_compression, a compatible CompressionOptions is added via `bottommost_compression_opts`. To keep backward compatible, a new boolean `enabled` is added to CompressionOptions. For compression_opts, it will be always used no matter what value of `enabled` is. For bottommost_compression_opts, it will only be used when user set `enabled=true`, otherwise, compression_opts will be used for bottommost_compression as default. @@ -666,7 +844,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a bug caused by not copying the block trailer with compressed SST file, direct IO, prefetcher and no compressed block cache. * Fix write can stuck indefinitely if enable_pipelined_write=true. The issue exists since pipelined write was introduced in 5.5.0. -## 5.14.0 (5/16/2018) +## 5.14.0 (2018-05-16) ### Public API Change * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. * The background thread naming convention changed (on supporting platforms) to "rocksdb:", e.g., "rocksdb:low0". @@ -699,7 +877,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances. * Added SstFileManager to the Java API to allow managing SST files across DB instances. -## 5.13.0 (3/20/2018) +## 5.13.0 (2018-03-20) ### Public API Change * RocksDBOptionsParser::Parse()'s `ignore_unknown_options` argument will only be effective if the option file shows it is generated using a higher version of RocksDB than the current version. * Remove CompactionEventListener. @@ -715,7 +893,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a leak in prepared_section_completed_ where the zeroed entries would not removed from the map. * Fix WAL corruption caused by race condition between user write thread and backup/checkpoint thread. -## 5.12.0 (2/14/2018) +## 5.12.0 (2018-02-14) ### Public API Change * Iterator::SeekForPrev is now a pure virtual method. This is to prevent user who implement the Iterator interface fail to implement SeekForPrev by mistake. 
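Relating to the `bottommost_compression_opts` and `enabled` flag introduced in 5.15.0 above (and refined by the 6.15.0 behavior change earlier in this file), a hedged configuration sketch follows; the compression type and dictionary sizes are illustrative and require ZSTD support in the build.

```cpp
#include <rocksdb/options.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  // Non-bottommost levels: plain ZSTD.
  options.compression = kZSTD;

  // Bottommost level: these options are only honored because `enabled` is set.
  options.bottommost_compression = kZSTD;
  options.bottommost_compression_opts.max_dict_bytes = 16 * 1024;
  options.bottommost_compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;
  options.bottommost_compression_opts.enabled = true;
  return 0;
}
```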
* Add `include_end` option to make the range end exclusive when `include_end == false` in `DeleteFilesInRange()`. @@ -737,7 +915,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix advance reservation of arena block addresses. * Fix handling of empty string as checkpoint directory. -## 5.11.0 (01/08/2018) +## 5.11.0 (2018-01-08) ### Public API Change * Add `autoTune` and `getBytesPerSecond()` to RocksJava RateLimiter @@ -754,7 +932,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a mislabel bug for bottom-pri compaction threads. * Fix DB::Flush() keep waiting after flush finish under certain condition. -## 5.10.0 (12/11/2017) +## 5.10.0 (2017-12-11) ### Public API Change * When running `make` with environment variable `USE_SSE` set and `PORTABLE` unset, will use all machine features available locally. Previously this combination only compiled SSE-related features. @@ -771,7 +949,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix performance issue in `IngestExternalFile()` affecting databases with large number of SST files. * Fix possible corruption to LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker. -## 5.9.0 (11/1/2017) +## 5.9.0 (2017-11-01) ### Public API Change * `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. Previously this condition disabled limiting backups opened. * `DBOptions::preserve_deletes` is a new option that allows one to specify that DB should not drop tombstones for regular deletes if they have sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default. @@ -798,7 +976,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Fix a potential data inconsistency issue during point-in-time recovery. `DB:Open()` will abort if column family inconsistency is found during PIT recovery. * Fix possible metadata corruption in databases using `DeleteRange()`. -## 5.8.0 (08/30/2017) +## 5.8.0 (2017-08-30) ### Public API Change * Users of `Statistics::getHistogramString()` will see fewer histogram buckets and different bucket endpoints. * `Slice::compare` and BytewiseComparator `Compare` no longer accept `Slice`s containing nullptr. @@ -818,7 +996,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Fix transient reappearance of keys covered by range deletions when memtable prefix bloom filter is enabled. * Fix potentially wrong file smallest key when range deletions separated by snapshot are written together. -## 5.7.0 (07/13/2017) +## 5.7.0 (2017-07-13) ### Public API Change * DB property "rocksdb.sstables" now prints keys in hex form. @@ -833,7 +1011,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Bug Fixes * Fix discarding empty compaction output files when `DeleteRange()` is used together with subcompactions. -## 5.6.0 (06/06/2017) +## 5.6.0 (2017-06-06) ### Public API Change * Scheduling flushes and compactions in the same thread pool is no longer supported by setting `max_background_flushes=0`. Instead, users can achieve this by configuring their high-pri thread pool to have zero threads. 
* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction. @@ -850,7 +1028,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Bug Fixes * Shouldn't ignore return value of fsync() in flush. -## 5.5.0 (05/17/2017) +## 5.5.0 (2017-05-17) ### New Features * FIFO compaction to support Intra L0 compaction too with CompactionOptionsFIFO.allow_compaction=true. * DB::ResetStats() to reset internal stats. @@ -867,7 +1045,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Bug Fixes * Fix the bug that Direct I/O uses direct reads for non-SST file -## 5.4.0 (04/11/2017) +## 5.4.0 (2017-04-11) ### Public API Change * random_access_max_buffer_size no longer has any effect * Removed Env::EnableReadAhead(), Env::ShouldForwardRawRequest() @@ -884,7 +1062,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Introduce level-based L0->L0 compactions to reduce file count, so write delays are incurred less often. * (Experimental) Partitioning filters which creates an index on the partitions. The feature can be enabled by setting partition_filters when using kFullFilter. Currently the feature also requires two-level indexing to be enabled. Number of partitions is the same as the number of partitions for indexes, which is controlled by metadata_block_size. -## 5.3.0 (03/08/2017) +## 5.3.0 (2017-03-08) ### Public API Change * Remove disableDataSync option. * Remove timeout_hint_us option from WriteOptions. The option has been deprecated and has no effect since 3.13.0. @@ -894,7 +1072,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Bug Fixes * Fix the bug that iterator may skip keys -## 5.2.0 (02/08/2017) +## 5.2.0 (2017-02-08) ### Public API Change * NewLRUCache() will determine number of shard bits automatically based on capacity, if the user doesn't pass one. This also impacts the default block cache when the user doesn't explicit provide one. * Change the default of delayed slowdown value to 16MB/s and further increase the L0 stop condition to 36 files. @@ -912,7 +1090,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Some fixes related to 2PC. * Fix bugs of data corruption in direct I/O -## 5.1.0 (01/13/2017) +## 5.1.0 (2017-01-13) * Support dynamically change `delete_obsolete_files_period_micros` option via SetDBOptions(). * Added EventListener::OnExternalFileIngested which will be called when IngestExternalFile() add a file successfully. * BackupEngine::Open and BackupEngineReadOnly::Open now always return error statuses matching those of the backup Env. @@ -921,7 +1099,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Fix the bug that if 2PC is enabled, checkpoints may loss some recent transactions. * When file copying is needed when creating checkpoints or bulk loading files, fsync the file after the file copying. -## 5.0.0 (11/17/2016) +## 5.0.0 (2016-11-17) ### Public API Change * Options::max_bytes_for_level_multiplier is now a double along with all getters and setters. * Support dynamically change `delayed_write_rate` and `max_total_wal_size` options via SetDBOptions(). 
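A small sketch of dynamically changing `delayed_write_rate` and `max_total_wal_size` at runtime via `SetDBOptions()`, as described above; option values are passed as strings, and the numbers here are illustrative.

```cpp
#include <rocksdb/db.h>

using namespace ROCKSDB_NAMESPACE;

// Adjust write throttling and WAL size limits without reopening the DB.
Status TuneWalLimits(DB* db) {
  return db->SetDBOptions({
      {"delayed_write_rate", "4194304"},     // 4 MB/s
      {"max_total_wal_size", "1073741824"},  // 1 GiB
  });
}
```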
@@ -940,7 +1118,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add LuaCompactionFilter in utilities. This allows developers to write compaction filters in Lua. To use this feature, LUA_PATH needs to be set to the root directory of Lua. * No longer populate "LATEST_BACKUP" file in backup directory, which formerly contained the number of the latest backup. The latest backup can be determined by finding the highest numbered file in the "meta/" subdirectory. -## 4.13.0 (10/18/2016) +## 4.13.0 (2016-10-18) ### Public API Change * DB::GetOptions() reflect dynamic changed options (i.e. through DB::SetOptions()) and return copy of options instead of reference. * Added Statistics::getAndResetTickerCount(). @@ -949,7 +1127,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add DB::SetDBOptions() to dynamic change base_background_compactions and max_background_compactions. * Added Iterator::SeekForPrev(). This new API will seek to the last key that less than or equal to the target key. -## 4.12.0 (9/12/2016) +## 4.12.0 (2016-09-12) ### Public API Change * CancelAllBackgroundWork() flushes all memtables for databases containing writes that have bypassed the WAL (writes issued with WriteOptions::disableWAL=true) before shutting down background threads. * Merge options source_compaction_factor, max_grandparent_overlap_bytes and expanded_compaction_factor into max_compaction_bytes. @@ -961,7 +1139,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Change ticker/histogram statistics implementations to accumulate data in thread-local storage, which improves CPU performance by reducing cache coherency costs. Callers of CreateDBStatistics do not need to change anything to use this feature. * Block cache mid-point insertion, where index and filter block are inserted into LRU block cache with higher priority. The feature can be enabled by setting BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority to true and high_pri_pool_ratio > 0 when creating NewLRUCache. -## 4.11.0 (8/1/2016) +## 4.11.0 (2016-08-01) ### Public API Change * options.memtable_prefix_bloom_huge_page_tlb_size => memtable_huge_page_size. When it is set, RocksDB will try to allocate memory from huge page for memtable too, rather than just memtable bloom filter. @@ -969,7 +1147,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * A tool to migrate DB after options change. See include/rocksdb/utilities/option_change_migration.h. * Add ReadOptions.background_purge_on_iterator_cleanup. If true, we avoid file deletion when destroying iterators. -## 4.10.0 (7/5/2016) +## 4.10.0 (2016-07-05) ### Public API Change * options.memtable_prefix_bloom_bits changes to options.memtable_prefix_bloom_bits_ratio and deprecate options.memtable_prefix_bloom_probes * enum type CompressionType and PerfLevel changes from char to unsigned char. Value of all PerfLevel shift by one. @@ -981,7 +1159,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). For data written by 4.6 or earlier, RepairDB associates it with the default column family. * Add options.write_buffer_manager which allows users to control total memtable sizes across multiple DB instances. 
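For the `options.write_buffer_manager` feature noted just above, a minimal sketch of sharing one memtable budget across several DB instances; the 512 MiB cap is illustrative.

```cpp
#include <memory>

#include <rocksdb/options.h>
#include <rocksdb/write_buffer_manager.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  // One manager caps the combined memtable memory of every DB that uses it.
  auto wbm = std::make_shared<WriteBufferManager>(512 * 1024 * 1024);

  Options options_db1;
  Options options_db2;
  options_db1.write_buffer_manager = wbm;
  options_db2.write_buffer_manager = wbm;
  // ... then DB::Open(options_db1, ...) and DB::Open(options_db2, ...) as usual.
  return 0;
}
```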
-## 4.9.0 (6/9/2016) +## 4.9.0 (2016-06-09) ### Public API changes * Add bottommost_compression option, This option can be used to set a specific compression algorithm for the bottommost level (Last level containing files in the DB). * Introduce CompactionJobInfo::compression, This field state the compression algorithm used to generate the output files of the compaction. @@ -991,7 +1169,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### New Features * Introduce NewSimCache() in rocksdb/utilities/sim_cache.h. This function creates a block cache that is able to give simulation results (mainly hit rate) of simulating block behavior with a configurable cache size. -## 4.8.0 (5/2/2016) +## 4.8.0 (2016-05-02) ### Public API Change * Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes. * Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F @@ -1001,12 +1179,12 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### New Features * Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size. -## 4.7.0 (4/8/2016) +## 4.7.0 (2016-04-08) ### Public API Change * rename options compaction_measure_io_stats to report_bg_io_stats and include flush too. * Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options. -## 4.6.0 (3/10/2016) +## 4.6.0 (2016-03-10) ### Public API Changes * Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier. * Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly. @@ -1017,7 +1195,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. * Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned" -## 4.5.0 (2/5/2016) +## 4.5.0 (2016-02-05) ### Public API Changes * Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes. * Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll. @@ -1028,7 +1206,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add kPersistedTier to ReadTier. 
This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true. * Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate. -## 4.4.0 (1/14/2016) +## 4.4.0 (2016-01-14) ### Public API Changes * Change names in CompactionPri and add a new one. * Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit. @@ -1038,7 +1216,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Increase default options.delayed_write_rate to 2MB/s. * Added a new parameter --path to ldb tool. --path accepts the name of either MANIFEST, SST or a WAL file. Either --db or --path can be used when calling ldb. -## 4.3.0 (12/8/2015) +## 4.3.0 (2015-12-08) ### New Features * CompactionFilter has new member function called IgnoreSnapshots which allows CompactionFilter to be called even if there are snapshots later than the key. * RocksDB will now persist options under the same directory as the RocksDB database on successful DB::Open, CreateColumnFamily, DropColumnFamily, and SetOptions. @@ -1048,7 +1226,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Public API Changes * When options.db_write_buffer_size triggers, only the column family with the largest column family size will be flushed, not all the column families. -## 4.2.0 (11/9/2015) +## 4.2.0 (2015-11-09) ### New Features * Introduce CreateLoggerFromOptions(), this function create a Logger for provided DBOptions. * Add GetAggregatedIntProperty(), which returns the sum of the GetIntProperty of all the column families. @@ -1061,7 +1239,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Remove DefaultCompactionFilterFactory. -## 4.1.0 (10/8/2015) +## 4.1.0 (2015-10-08) ### New Features * Added single delete operation as a more efficient way to delete keys that have not been overwritten. * Added experimental AddFile() to DB interface that allow users to add files created by SstFileWriter into an empty Database, see include/rocksdb/sst_file_writer.h and DB::AddFile() for more info. @@ -1075,7 +1253,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * CompactionFilter has a new method FilterMergeOperand() that RocksDB applies to every merge operand during compaction to decide whether to filter the operand. * We removed CompactionFilterV2 interfaces from include/rocksdb/compaction_filter.h. The functionality was deprecated already in version 3.13. -## 4.0.0 (9/9/2015) +## 4.0.0 (2015-09-09) ### New Features * Added support for transactions. See include/rocksdb/utilities/transaction.h for more info. * DB::GetProperty() now accepts "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used. @@ -1088,7 +1266,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Added Equal() method to the Comparator interface that can optionally be overwritten in cases where equality comparisons can be done more efficiently than three-way comparisons. * Previous 'experimental' OptimisticTransaction class has been replaced by Transaction class. 
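For the transaction support introduced in 4.0.0 above (see include/rocksdb/utilities/transaction.h), a minimal pessimistic-transaction sketch follows; the path and key/value are illustrative.

```cpp
#include <rocksdb/options.h>
#include <rocksdb/utilities/transaction.h>
#include <rocksdb/utilities/transaction_db.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  TransactionDBOptions txn_db_options;

  TransactionDB* txn_db = nullptr;
  Status s = TransactionDB::Open(options, txn_db_options,
                                 "/tmp/rocksdb_txn_example", &txn_db);
  if (!s.ok()) return 1;

  // Writes made through the transaction become visible only on Commit().
  Transaction* txn = txn_db->BeginTransaction(WriteOptions());
  txn->Put("key", "value");
  s = txn->Commit();
  delete txn;
  delete txn_db;
  return s.ok() ? 0 : 1;
}
```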
-## 3.13.0 (8/6/2015) +## 3.13.0 (2015-08-06) ### New Features * RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex * Add NewCompactOnDeletionCollectorFactory() in utilities/table_properties_collectors, which allows rocksdb to mark a SST file as need-compaction when it observes at least D deletion entries in any N consecutive entries in that SST file. Note that this feature depends on an experimental NeedCompact() API --- the result of this API will not persist after DB restart. @@ -1103,7 +1281,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add statistics::getHistogramString() to print detailed distribution of a histogram metric. * Add DBOptions::skip_stats_update_on_db_open. When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction. -## 3.12.0 (7/2/2015) +## 3.12.0 (2015-07-02) ### New Features * Added experimental support for optimistic transactions. See include/rocksdb/utilities/optimistic_transaction.h for more info. * Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds) @@ -1133,7 +1311,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add BackupEngineImpl.options_.max_background_operations to specify the maximum number of operations that may be performed in parallel. Add support for parallelized backup and restore. * Add DB::SyncWAL() that does a WAL sync without blocking writers. -## 3.11.0 (5/19/2015) +## 3.11.0 (2015-05-19) ### New Features * Added a new API Cache::SetCapacity(size_t capacity) to dynamically change the maximum configured capacity of the cache. If the new capacity is less than the existing cache usage, the implementation will try to lower the usage by evicting the necessary number of elements following a strict LRU policy. * Added an experimental API for handling flashcache devices (blacklists background threads from caching their reads) -- NewFlashcacheAwareEnv @@ -1144,7 +1322,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users. * DBOptions::bytes_per_sync used to apply to both WAL and table files. As of 3.11 it applies only to table files. If you want to use this option to sync WAL in the background, please use wal_bytes_per_sync -## 3.10.0 (3/24/2015) +## 3.10.0 (2015-03-24) ### New Features * GetThreadStatus() is now able to report detailed thread status, including: - Thread Operation including flush and compaction. @@ -1179,7 +1357,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * lz4 compression is now included in rocksjava static library when running `make rocksdbjavastatic`. * Overflowing a size_t when setting rocksdb options now throws an IllegalArgumentException, which removes the necessity for a developer to catch these Exceptions explicitly. -## 3.9.0 (12/8/2014) +## 3.9.0 (2014-12-08) ### New Features * Add rocksdb::GetThreadList(), which in the future will return the current status of all @@ -1198,7 +1376,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Improvements * RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag. 
-## 3.8.0 (11/14/2014) +## 3.8.0 (2014-11-14) ### Public API changes * BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on. @@ -1212,14 +1390,14 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * CompactFiles and EventListener, although they are still in experimental state * Full ColumnFamily support in RocksJava. -## 3.7.0 (11/6/2014) +## 3.7.0 (2014-11-06) ### Public API changes * Introduce SetOptions() API to allow adjusting a subset of options dynamically online * Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() * Remove WriteBatchWithIndex.Delete() overloads using SliceParts * When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it. -## 3.6.0 (10/7/2014) +## 3.6.0 (2014-10-07) ### Disk format changes * If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy @@ -1232,7 +1410,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Change target_file_size_base type to uint64_t from int. * Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on. -## 3.5.0 (9/3/2014) +## 3.5.0 (2014-09-03) ### New Features * Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it. * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: @@ -1243,7 +1421,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Public API changes * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. -## 3.4.0 (8/18/2014) +## 3.4.0 (2014-08-18) ### New Features * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. @@ -1259,7 +1437,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add DB::GetIntProperty(), which returns DB properties that are integer as uint64_t. * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. -## 3.3.0 (7/10/2014) +## 3.3.0 (2014-07-10) ### New Features * Added JSON API prototype. * HashLinklist reduces performance outlier caused by skewed bucket by switching data in the bucket from linked list to skip list. Add parameter threshold_use_skiplist in NewHashLinkListRepFactory(). @@ -1270,7 +1448,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Public API changes * Removed NewTotalOrderPlainTableFactory because it is not used and implemented semantically incorrect. -## 3.2.0 (06/20/2014) +## 3.2.0 (2014-06-20) ### Public API changes * We removed seek compaction as a concept from RocksDB because: @@ -1288,7 +1466,7 @@ Because of that, Options::disable_seek_compaction is now obsolete. 
It is still a ### Performance Improvements * Tailing Iterator re-implemeted with ForwardIterator + Cascading Search Hint , see ~20% throughput improvement. -## 3.1.0 (05/21/2014) +## 3.1.0 (2014-05-21) ### Public API changes * Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories @@ -1297,7 +1475,7 @@ Because of that, Options::disable_seek_compaction is now obsolete. It is still a * Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open. * FIFO compaction style -## 3.0.0 (05/05/2014) +## 3.0.0 (2014-05-05) ### Public API changes * Added _LEVEL to all InfoLogLevel enums @@ -1309,7 +1487,7 @@ Because of that, Options::disable_seek_compaction is now obsolete. It is still a * Added an option to use different checksum functions in BlockBasedTableOptions * Added ApplyToAllCacheEntries() function to Cache -## 2.8.0 (04/04/2014) +## 2.8.0 (2014-04-04) * Removed arena.h from public header files. * By default, checksums are verified on every read from database @@ -1338,7 +1516,7 @@ Because of that, Options::disable_seek_compaction is now obsolete. It is still a * Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. * Geo-spatial support for locations and radial-search. -## 2.7.0 (01/28/2014) +## 2.7.0 (2014-01-28) ### Public API changes diff --git a/INSTALL.md b/INSTALL.md index 9163df166d3..0165e9c128d 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -43,6 +43,8 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi command line flags processing. You can compile rocksdb library even if you don't have gflags installed. +* `make check` will also check code formatting, which requires [clang-format](https://clang.llvm.org/docs/ClangFormat.html) + * If you wish to build the RocksJava static target, then cmake is required for building Snappy. ## Supported platforms @@ -94,12 +96,21 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi sudo yum install libasan * Install zstandard: + * With [EPEL](https://fedoraproject.org/wiki/EPEL): + + sudo yum install libzstd-devel + + * With CentOS 8: + + sudo dnf install libzstd-devel + + * From source: - wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz - mv v1.1.3.tar.gz zstd-1.1.3.tar.gz - tar zxvf zstd-1.1.3.tar.gz - cd zstd-1.1.3 - make && sudo make install + wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz + mv v1.1.3.tar.gz zstd-1.1.3.tar.gz + tar zxvf zstd-1.1.3.tar.gz + cd zstd-1.1.3 + make && sudo make install * **OS X**: * Install latest C++ compiler that supports C++ 11: diff --git a/Makefile b/Makefile index 9f0a4923f23..7433056ad9a 100644 --- a/Makefile +++ b/Makefile @@ -55,60 +55,33 @@ DEBUG_LEVEL?=1 # Set the default LIB_MODE to static LIB_MODE?=static -ifeq ($(MAKECMDGOALS),dbg) - DEBUG_LEVEL=2 -endif +# OBJ_DIR is where the object files reside. Default to the current directory +OBJ_DIR?=. 
-ifeq ($(MAKECMDGOALS),clean) - DEBUG_LEVEL=0 -endif +# Check the MAKECMDGOALS to set the DEBUG_LEVEL and LIB_MODE appropriately -ifeq ($(MAKECMDGOALS),release) +ifneq ($(filter clean release install, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 endif - -ifeq ($(MAKECMDGOALS),shared_lib) - LIB_MODE=shared +ifneq ($(filter dbg, $(MAKECMDGOALS)),) + DEBUG_LEVEL=2 +else ifneq ($(filter shared_lib install-shared, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),install-shared) LIB_MODE=shared - DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),static_lib) +else ifneq ($(filter static_lib install-static, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 LIB_MODE=static -endif - -ifeq ($(MAKECMDGOALS),install-static) - DEBUG_LEVEL=0 - LIB_MODE=static -endif - -ifeq ($(MAKECMDGOALS),install) - DEBUG_LEVEL=0 -endif - - -ifneq ($(findstring jtest, $(MAKECMDGOALS)),) +else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),) OBJ_DIR=jl LIB_MODE=shared -endif - -ifneq ($(findstring rocksdbjava, $(MAKECMDGOALS)),) - LIB_MODE=shared - ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) + ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) OBJ_DIR=jls - ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 - endif - ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) - DEBUG_LEVEL=0 - endif - else - OBJ_DIR=jl + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif + ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) + DEBUG_LEVEL=0 + endif endif endif @@ -190,6 +163,11 @@ else endif ifdef ASSERT_STATUS_CHECKED +# For ASC, turn off constructor elision, preventing the case where a constructor returned +# by a method may pass the ASC check if the status is checked in the inner method. Forcing +# the copy constructor to be invoked disables the optimization and will cause the calling method +# to check the status in order to prevent an error from being raised. +PLATFORM_CXXFLAGS += -fno-elide-constructors ifeq ($(filter -DROCKSDB_ASSERT_STATUS_CHECKED,$(OPT)),) OPT += -DROCKSDB_ASSERT_STATUS_CHECKED endif @@ -254,6 +232,8 @@ AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%. # Export some common variables that might have been passed as Make variables # instead of environment variables. dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ + export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \ + export LDFLAGS="$(EXTRA_LDFLAGS)"; \ export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \ export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \ export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \ @@ -264,6 +244,12 @@ dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ # this file is generated by the previous line to set build flags and sources include make_config.mk +ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk) +include $(ROCKSDB_PLUGIN_MKS) +ROCKSDB_PLUGIN_SOURCES = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach source, $($(plugin)_SOURCES), plugin/$(plugin)/$(source))) +ROCKSDB_PLUGIN_HEADERS = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach header, $($(plugin)_HEADERS), plugin/$(plugin)/$(header))) +PLATFORM_LDFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_LDFLAGS)) + export JAVAC_ARGS CLEAN_FILES += make_config.mk rocksdb.pc @@ -414,6 +400,10 @@ ifdef TEST_UINT128_COMPAT PLATFORM_CCFLAGS += -DTEST_UINT128_COMPAT=1 PLATFORM_CXXFLAGS += -DTEST_UINT128_COMPAT=1 endif +ifdef ROCKSDB_MODIFY_NPHASH + PLATFORM_CCFLAGS += -DROCKSDB_MODIFY_NPHASH=1 + PLATFORM_CXXFLAGS += -DROCKSDB_MODIFY_NPHASH=1 +endif # This (the first rule) must depend on "all". 
default: all @@ -421,6 +411,10 @@ default: all WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \ -Wunused-parameter +ifeq (,$(filter amd64, $(MACHINE))) + C_WARNING_FLAGS = -Wstrict-prototypes +endif + ifdef USE_CLANG # Used by some teams in Facebook WARNING_FLAGS += -Wshift-sign-overflow @@ -465,38 +459,13 @@ ifeq ($(NO_THREEWAY_CRC32C), 1) CXXFLAGS += -DNO_THREEWAY_CRC32C endif -CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) +CFLAGS += $(C_WARNING_FLAGS) $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers LDFLAGS += $(PLATFORM_LDFLAGS) -# If NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but -# the file needs to already exist or else the build will fail -ifndef NO_UPDATE_BUILD_VERSION -date := $(shell date +%F) -ifdef FORCE_GIT_SHA - git_sha := $(FORCE_GIT_SHA) -else - git_sha := $(shell git rev-parse HEAD 2>/dev/null) -endif -gen_build_version = sed -e s/@@GIT_SHA@@/$(git_sha)/ -e s/@@GIT_DATE_TIME@@/$(date)/ util/build_version.cc.in - -# Record the version of the source that we are compiling. -# We keep a record of the git revision in this file. It is then built -# as a regular source file as part of the compilation process. -# One can run "strings executable_filename | grep _build_" to find -# the version of the source that we used to build the executable file. -FORCE: -util/build_version.cc: FORCE - $(AM_V_GEN)rm -f $@-t - $(AM_V_at)$(gen_build_version) > $@-t - $(AM_V_at)if test -f $@; then \ - cmp -s $@-t $@ && rm -f $@-t || mv -f $@-t $@; \ - else mv -f $@-t $@; fi -endif - -OBJ_DIR?=. LIB_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(LIB_SOURCES)) +LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ROCKSDB_PLUGIN_SOURCES)) ifeq ($(HAVE_POWER8),1) LIB_OBJECTS += $(patsubst %.c, $(OBJ_DIR)/%.o, $(LIB_SOURCES_C)) LIB_OBJECTS += $(patsubst %.S, $(OBJ_DIR)/%.o, $(LIB_SOURCES_ASM)) @@ -506,6 +475,12 @@ ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) LIB_OBJECTS += $(patsubst %.cpp, $(OBJ_DIR)/%.o, $(FOLLY_SOURCES)) endif +# range_tree is not compatible with non GNU libc on ppc64 +# see https://jira.percona.com/browse/PS-7559 +ifneq ($(PPC_LIBC_IS_GNU),0) + LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES)) +endif + GTEST = $(OBJ_DIR)/$(GTEST_DIR)/gtest/gtest-all.o TESTUTIL = $(OBJ_DIR)/test_util/testutil.o TESTHARNESS = $(OBJ_DIR)/test_util/testharness.o $(TESTUTIL) $(GTEST) @@ -516,12 +491,14 @@ VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST) BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES)) +CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES)) TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES)) ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES)) STRESS_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES)) -ALL_SOURCES = $(LIB_SOURCES) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc -ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) +# Exclude build_version.cc -- a generated source file -- from all sources. 
Not needed for dependencies +ALL_SOURCES = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc +ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES) TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES))) @@ -532,160 +509,40 @@ ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) ALL_SOURCES += third-party/folly/folly/synchronization/test/DistributedMutexTest.cc endif -PARALLEL_TEST = \ - backupable_db_test \ - db_bloom_filter_test \ - db_compaction_filter_test \ - db_compaction_test \ - db_merge_operator_test \ - db_sst_test \ - db_test \ - db_test2 \ - db_universal_compaction_test \ - db_wal_test \ - column_family_test \ - external_sst_file_test \ - import_column_family_test \ - fault_injection_test \ - file_reader_writer_test \ - inlineskiplist_test \ - manual_compaction_test \ - persistent_cache_test \ - table_test \ - transaction_test \ - transaction_lock_mgr_test \ - write_prepared_transaction_test \ - write_unprepared_transaction_test \ - -ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) - TESTS += folly_synchronization_distributed_mutex_test - PARALLEL_TEST += folly_synchronization_distributed_mutex_test - TESTS_PASSING_ASC = folly_synchronization_distributed_mutex_test -endif - # options_settable_test doesn't pass with UBSAN as we use hack in the test ifdef COMPILE_WITH_UBSAN TESTS := $(shell echo $(TESTS) | sed 's/\boptions_settable_test\b//g') endif ifdef ASSERT_STATUS_CHECKED -# This is a new check for which we will add support incrementally. This -# list can be removed once support is fully added. - TESTS_PASSING_ASC = \ - arena_test \ - autovector_test \ - cache_test \ - lru_cache_test \ - blob_file_addition_test \ - blob_file_builder_test \ - blob_file_garbage_test \ - blob_file_reader_test \ - bloom_test \ - cassandra_format_test \ - cassandra_row_merge_test \ - cassandra_serialize_test \ - cleanable_test \ - coding_test \ - crc32c_test \ - dbformat_test \ - db_basic_test \ - db_with_timestamp_basic_test \ - db_with_timestamp_compaction_test \ - db_options_test \ - db_properties_test \ - db_secondary_test \ - options_file_test \ - defer_test \ - filename_test \ - dynamic_bloom_test \ - env_basic_test \ + # TODO: finish fixing all tests to pass this check + TESTS_FAILING_ASC = \ + c_test \ + db_test \ + db_test2 \ env_test \ - env_logger_test \ - event_logger_test \ - error_handler_fs_test \ - auto_roll_logger_test \ - file_indexer_test \ - flush_job_test \ - hash_table_test \ - hash_test \ - heap_test \ - histogram_test \ - inlineskiplist_test \ - io_posix_test \ - iostats_context_test \ - ldb_cmd_test \ - memkind_kmem_allocator_test \ - merger_test \ - mock_env_test \ - object_registry_test \ - prefix_test \ - repair_test \ - configurable_test \ - options_settable_test \ - options_test \ - random_test \ - range_del_aggregator_test \ - sst_file_reader_test \ - range_tombstone_fragmenter_test \ - repeatable_thread_test \ - skiplist_test \ - slice_test \ - sst_dump_test \ - statistics_test \ - stats_history_test \ - thread_local_test \ - trace_analyzer_test \ - env_timed_test \ - filelock_test \ - timer_queue_test \ - timer_test \ - options_util_test \ - persistent_cache_test \ - util_merge_operators_test \ - block_cache_trace_analyzer_test \ - block_cache_tracer_test \ - cache_simulator_test \ - sim_cache_test \ - version_builder_test \ - version_edit_test \ - 
work_queue_test \ - write_controller_test \ - compaction_iterator_test \ - compaction_job_test \ - compaction_job_stats_test \ - io_tracer_test \ - merge_helper_test \ - memtable_list_test \ - flush_job_test \ - block_based_filter_block_test \ - block_fetcher_test \ - full_filter_block_test \ - partitioned_filter_block_test \ - column_family_test \ - file_reader_writer_test \ - corruption_test \ - db_universal_compaction_test \ - import_column_family_test \ - memory_test \ - table_test \ + range_locking_test \ + testutil_test \ -ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) -TESTS_PASSING_ASC += folly_synchronization_distributed_mutex_test + # Since we have very few ASC exclusions left, excluding them from + # the build is the most convenient way to exclude them from testing + TESTS := $(filter-out $(TESTS_FAILING_ASC),$(TESTS)) endif - # Enable building all unit tests, but use check_some to run only tests - # known to pass ASC (ASSERT_STATUS_CHECKED) - ROCKSDBTESTS_SUBSET ?= $(TESTS_PASSING_ASC) - # Alternate: only build unit tests known to pass ASC, and run them - # with make check - #TESTS := $(filter $(TESTS_PASSING_ASC),$(TESTS)) - #PARALLEL_TEST := $(filter $(TESTS_PASSING_ASC),$(PARALLEL_TEST)) -else - ROCKSDBTESTS_SUBSET ?= $(TESTS) -endif +ROCKSDBTESTS_SUBSET ?= $(TESTS) + +# env_test - suspicious use of test::TmpDir +# deletefile_test - serial because it generates giant temporary files in +# its various tests. Parallel can fill up your /dev/shm +NON_PARALLEL_TEST = \ + env_test \ + deletefile_test \ + +PARALLEL_TEST = $(filter-out $(NON_PARALLEL_TEST), $(TESTS)) + # Not necessarily well thought out or up-to-date, but matches old list TESTS_PLATFORM_DEPENDENT := \ db_basic_test \ - db_with_timestamp_basic_test \ + db_blob_basic_test \ db_encryption_test \ db_test2 \ external_sst_file_basic_test \ @@ -703,6 +560,7 @@ TESTS_PLATFORM_DEPENDENT := \ io_posix_test \ hash_test \ random_test \ + ribbon_test \ thread_local_test \ work_queue_test \ rate_limiter_test \ @@ -768,13 +626,44 @@ else LIBRARY=$(STATIC_LIBRARY) TEST_LIBRARY=$(STATIC_TEST_LIBRARY) TOOLS_LIBRARY=$(STATIC_TOOLS_LIBRARY) -STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY) endif +STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY) ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +# If NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but +# the file needs to already exist or else the build will fail +ifndef NO_UPDATE_BUILD_VERSION + +# By default, use the current date-time as the date. If there are no changes, +# we will use the last commit date instead. +build_date := $(shell date "+%Y-%m-%d %T") + +ifdef FORCE_GIT_SHA + git_sha := $(FORCE_GIT_SHA) + git_mod := 1 + git_date := $(build_date) +else + git_sha := $(shell git rev-parse HEAD 2>/dev/null) + git_tag := $(shell git symbolic-ref -q --short HEAD 2> /dev/null || git describe --tags --exact-match 2>/dev/null) + git_mod := $(shell git diff-index HEAD --quiet 2>/dev/null; echo $$?) + git_date := $(shell git log -1 --date=format:"%Y-%m-%d %T" --format="%ad" 2>/dev/null) +endif +gen_build_version = sed -e s/@GIT_SHA@/$(git_sha)/ -e s:@GIT_TAG@:"$(git_tag)": -e s/@GIT_MOD@/"$(git_mod)"/ -e s/@BUILD_DATE@/"$(build_date)"/ -e s/@GIT_DATE@/"$(git_date)"/ util/build_version.cc.in + +# Record the version of the source that we are compiling. 
+# We keep a record of the git revision in this file. It is then built +# as a regular source file as part of the compilation process. +# One can run "strings executable_filename | grep _build_" to find +# the version of the source that we used to build the executable file. +util/build_version.cc: $(filter-out $(OBJ_DIR)/util/build_version.o, $(LIB_OBJECTS)) util/build_version.cc.in + $(AM_V_GEN)rm -f $@-t + $(AM_V_at)$(gen_build_version) > $@ +endif +CLEAN_FILES += util/build_version.cc + default: all #----------------------------------------------- @@ -822,7 +711,8 @@ endif # PLATFORM_SHARED_EXT analyze tools tools_lib \ blackbox_crash_test_with_atomic_flush whitebox_crash_test_with_atomic_flush \ blackbox_crash_test_with_txn whitebox_crash_test_with_txn \ - blackbox_crash_test_with_best_efforts_recovery + blackbox_crash_test_with_best_efforts_recovery \ + blackbox_crash_test_with_ts whitebox_crash_test_with_ts all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS) @@ -938,7 +828,7 @@ gen_parallel_tests: # 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest # slow_test_regexp = \ - ^.*SnapshotConcurrentAccessTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ + ^.*SnapshotConcurrentAccessTest.*$$|^.*SeqAdvanceConcurrentTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ prioritize_long_running_tests = \ perl -pe 's,($(slow_test_regexp)),100 $$1,' \ | sort -k1,1gr \ @@ -1060,6 +950,8 @@ crash_test_with_txn: whitebox_crash_test_with_txn blackbox_crash_test_with_txn crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery +crash_test_with_ts: whitebox_crash_test_with_ts blackbox_crash_test_with_ts + blackbox_crash_test: db_stress $(PYTHON) -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS) $(PYTHON) -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) @@ -1073,6 +965,9 @@ blackbox_crash_test_with_txn: db_stress blackbox_crash_test_with_best_efforts_recovery: db_stress $(PYTHON) -u tools/db_crashtest.py --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS) +blackbox_crash_test_with_ts: db_stress + $(PYTHON) -u tools/db_crashtest.py --enable_ts blackbox $(CRASH_TEST_EXT_ARGS) + ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 endif @@ -1091,6 +986,10 @@ whitebox_crash_test_with_txn: db_stress $(PYTHON) -u tools/db_crashtest.py --txn whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) +whitebox_crash_test_with_ts: db_stress + $(PYTHON) -u tools/db_crashtest.py --enable_ts whitebox --random_kill_odd \ + $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + asan_check: clean COMPILE_WITH_ASAN=1 $(MAKE) check -j32 $(MAKE) clean @@ -1236,8 +1135,9 @@ analyze_incremental: $(MAKE) dbg CLEAN_FILES += unity.cc -unity.cc: Makefile +unity.cc: Makefile util/build_version.cc.in rm -f $@ $@-t + $(AM_V_at)$(gen_build_version) > util/build_version.cc for source_file in $(LIB_SOURCES); do \ echo "#include \"$$source_file\"" >> $@-t; \ done @@ -1317,11 +1217,11 @@ $(STATIC_TEST_LIBRARY): $(TEST_OBJECTS) $(AM_V_AR)rm -f $@ $(SHARED_TEST_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -$(STATIC_TOOLS_LIBRARY): $(BENCH_OBJECTS) $(TOOL_OBJECTS) +$(STATIC_TOOLS_LIBRARY): $(TOOL_OBJECTS) $(AM_V_AR)rm -f $@ $(SHARED_TOOLS_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -$(STATIC_STRESS_LIBRARY): 
$(ANALYZE_OBJECTS) $(STRESS_OBJECTS) +$(STATIC_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(AM_V_AR)rm -f $@ $(SHARED_STRESS_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ @@ -1333,7 +1233,7 @@ $(SHARED_TOOLS_LIBRARY): $(TOOL_OBJECTS) $(SHARED1) $(AM_V_AR)rm -f $@ $(STATIC_TOOLS_LIBRARY) $(AM_SHARE) -$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(SHARED_TOOLS_LIBRARY) $(SHARED1) +$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(SHARED_TOOLS_LIBRARY) $(SHARED1) $(AM_V_AR)rm -f $@ $(STATIC_STRESS_LIBRARY) $(AM_SHARE) @@ -1355,7 +1255,7 @@ folly_synchronization_distributed_mutex_test: $(OBJ_DIR)/third-party/folly/folly $(AM_LINK) endif -cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(LIBRARY) +cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY) $(AM_LINK) persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY) @@ -1415,6 +1315,9 @@ hash_test: $(OBJ_DIR)/util/hash_test.o $(TEST_LIBRARY) $(LIBRARY) random_test: $(OBJ_DIR)/util/random_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +ribbon_test: $(OBJ_DIR)/util/ribbon_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + option_change_migration_test: $(OBJ_DIR)/utilities/option_change_migration/option_change_migration_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1460,6 +1363,12 @@ slice_transform_test: $(OBJ_DIR)/util/slice_transform_test.o $(TEST_LIBRARY) $(L db_basic_test: $(OBJ_DIR)/db/db_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_blob_basic_test: $(OBJ_DIR)/db/blob/db_blob_basic_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_blob_compaction_test: $(OBJ_DIR)/db/blob/db_blob_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_with_timestamp_basic_test: $(OBJ_DIR)/db/db_with_timestamp_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1508,6 +1417,9 @@ db_inplace_update_test: $(OBJ_DIR)/db/db_inplace_update_test.o $(TEST_LIBRARY) $ db_iterator_test: $(OBJ_DIR)/db/db_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_kv_checksum_test: $(OBJ_DIR)/db/db_kv_checksum_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_memtable_test: $(OBJ_DIR)/db/db_memtable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1630,6 +1542,9 @@ compaction_job_test: $(OBJ_DIR)/db/compaction/compaction_job_test.o $(TEST_LIBRA compaction_job_stats_test: $(OBJ_DIR)/db/compaction/compaction_job_stats_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +compaction_service_test: $(OBJ_DIR)/db/compaction/compaction_service_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + compact_on_deletion_collector_test: $(OBJ_DIR)/utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1783,6 +1698,9 @@ compact_files_test: $(OBJ_DIR)/db/compact_files_test.o $(TEST_LIBRARY) $(LIBRARY configurable_test: options/configurable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +customizable_test: options/customizable_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + options_test: $(OBJ_DIR)/options/options_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1834,7 +1752,7 @@ write_callback_test: $(OBJ_DIR)/db/write_callback_test.o $(TEST_LIBRARY) $(LIBRA heap_test: $(OBJ_DIR)/util/heap_test.o $(GTEST) $(AM_LINK) -transaction_lock_mgr_test: utilities/transactions/transaction_lock_mgr_test.o $(TEST_LIBRARY) $(LIBRARY) +point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) transaction_test: 
$(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY) @@ -1903,13 +1821,16 @@ blob_db_test: $(OBJ_DIR)/utilities/blob_db/blob_db_test.o $(TEST_LIBRARY) $(LIBR repeatable_thread_test: $(OBJ_DIR)/util/repeatable_thread_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +range_locking_test: utilities/transactions/lock/range/range_locking_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + range_tombstone_fragmenter_test: $(OBJ_DIR)/db/range_tombstone_fragmenter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) sst_file_reader_test: $(OBJ_DIR)/table/sst_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_secondary_test: $(OBJ_DIR)/db/db_impl/db_secondary_test.o $(TEST_LIBRARY) $(LIBRARY) +db_secondary_test: $(OBJ_DIR)/db/db_secondary_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) block_cache_tracer_test: $(OBJ_DIR)/trace_replay/block_cache_tracer_test.o $(TEST_LIBRARY) $(LIBRARY) @@ -1927,6 +1848,9 @@ blob_file_addition_test: $(OBJ_DIR)/db/blob/blob_file_addition_test.o $(TEST_LIB blob_file_builder_test: $(OBJ_DIR)/db/blob/blob_file_builder_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +blob_file_cache_test: $(OBJ_DIR)/db/blob/blob_file_cache_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + blob_file_garbage_test: $(OBJ_DIR)/db/blob/blob_file_garbage_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1954,6 +1878,15 @@ io_tracer_parser_test: $(OBJ_DIR)/tools/io_tracer_parser_test.o $(OBJ_DIR)/tools io_tracer_parser: $(OBJ_DIR)/tools/io_tracer_parser.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) +db_blob_corruption_test: $(OBJ_DIR)/db/blob/db_blob_corruption_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_write_buffer_manager_test: $(OBJ_DIR)/db/db_write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +clipping_iterator_test: $(OBJ_DIR)/db/compaction/clipping_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + #------------------------------------------------- # make install related stuff PREFIX ?= /usr/local @@ -1978,6 +1911,10 @@ install-headers: gen-pc for header in `$(FIND) "include/rocksdb" -type f -name *.h`; do \ install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/$$header; \ done + for header in $(ROCKSDB_PLUGIN_HEADERS); do \ + install -d $(DESTDIR)/$(PREFIX)/include/rocksdb/`dirname $$header`; \ + install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/include/rocksdb/$$header; \ + done install -C -m 644 rocksdb.pc $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc install-static: install-headers $(LIBRARY) @@ -2061,11 +1998,11 @@ BZIP2_DOWNLOAD_BASE ?= https://sourceware.org/pub/bzip2 SNAPPY_VER ?= 1.1.8 SNAPPY_SHA256 ?= 16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive -LZ4_VER ?= 1.9.2 -LZ4_SHA256 ?= 658ba6191fa44c92280d4aa2c271b0f4fbc0e34d249578dd05e50e76d0e5efcc +LZ4_VER ?= 1.9.3 +LZ4_SHA256 ?= 030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.4.4 -ZSTD_SHA256 ?= a364f5162c7d1a455cc915e8e3cf5f4bd8b75d09bc0f53965b0c9ca1383c52c8 +ZSTD_VER ?= 1.4.9 +ZSTD_SHA256 ?= acf714d98e3db7b876e5b540cbf6dee298f60eb3c0723104f6d3f065cd60d6a8 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 @@ -2097,80 +2034,80 @@ ifeq ($(PLATFORM), OS_AIX) SNAPPY_MAKE_TARGET = libsnappy.la endif ifeq ($(PLATFORM), OS_OPENBSD) - JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd ROCKSDBJNILIB = 
librocksdbjni-openbsd$(ARCH).so - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar endif -libz.a: - -rm -rf zlib-$(ZLIB_VER) -ifeq (,$(wildcard ./zlib-$(ZLIB_VER).tar.gz)) +zlib-$(ZLIB_VER).tar.gz: curl --fail --output zlib-$(ZLIB_VER).tar.gz --location ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz -endif ZLIB_SHA256_ACTUAL=`$(SHA256_CMD) zlib-$(ZLIB_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZLIB_SHA256)" != "$$ZLIB_SHA256_ACTUAL" ]; then \ echo zlib-$(ZLIB_VER).tar.gz checksum mismatch, expected=\"$(ZLIB_SHA256)\" actual=\"$$ZLIB_SHA256_ACTUAL\"; \ exit 1; \ fi + +libz.a: zlib-$(ZLIB_VER).tar.gz + -rm -rf zlib-$(ZLIB_VER) tar xvzf zlib-$(ZLIB_VER).tar.gz - cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${EXTRA_CFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' ./configure --static && $(MAKE) + cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' ./configure --static && $(MAKE) cp zlib-$(ZLIB_VER)/libz.a . -libbz2.a: - -rm -rf bzip2-$(BZIP2_VER) -ifeq (,$(wildcard ./bzip2-$(BZIP2_VER).tar.gz)) +bzip2-$(BZIP2_VER).tar.gz: curl --fail --output bzip2-$(BZIP2_VER).tar.gz --location ${CURL_SSL_OPTS} ${BZIP2_DOWNLOAD_BASE}/bzip2-$(BZIP2_VER).tar.gz -endif BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \ echo bzip2-$(BZIP2_VER).tar.gz checksum mismatch, expected=\"$(BZIP2_SHA256)\" actual=\"$$BZIP2_SHA256_ACTUAL\"; \ exit 1; \ fi + +libbz2.a: bzip2-$(BZIP2_VER).tar.gz + -rm -rf bzip2-$(BZIP2_VER) tar xvzf bzip2-$(BZIP2_VER).tar.gz - cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 ${EXTRA_CFLAGS}' AR='ar ${EXTRA_ARFLAGS}' + cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' AR='ar ${EXTRA_ARFLAGS}' cp bzip2-$(BZIP2_VER)/libbz2.a . -libsnappy.a: - -rm -rf snappy-$(SNAPPY_VER) -ifeq (,$(wildcard ./snappy-$(SNAPPY_VER).tar.gz)) +snappy-$(SNAPPY_VER).tar.gz: curl --fail --output snappy-$(SNAPPY_VER).tar.gz --location ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER).tar.gz -endif SNAPPY_SHA256_ACTUAL=`$(SHA256_CMD) snappy-$(SNAPPY_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(SNAPPY_SHA256)" != "$$SNAPPY_SHA256_ACTUAL" ]; then \ echo snappy-$(SNAPPY_VER).tar.gz checksum mismatch, expected=\"$(SNAPPY_SHA256)\" actual=\"$$SNAPPY_SHA256_ACTUAL\"; \ exit 1; \ fi + +libsnappy.a: snappy-$(SNAPPY_VER).tar.gz + -rm -rf snappy-$(SNAPPY_VER) tar xvzf snappy-$(SNAPPY_VER).tar.gz mkdir snappy-$(SNAPPY_VER)/build - cd snappy-$(SNAPPY_VER)/build && CFLAGS='${EXTRA_CFLAGS}' CXXFLAGS='${EXTRA_CXXFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && $(MAKE) ${SNAPPY_MAKE_TARGET} + cd snappy-$(SNAPPY_VER)/build && CFLAGS='${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET} cp snappy-$(SNAPPY_VER)/build/libsnappy.a . 
-liblz4.a: - -rm -rf lz4-$(LZ4_VER) -ifeq (,$(wildcard ./lz4-$(LZ4_VER).tar.gz)) +lz4-$(LZ4_VER).tar.gz: curl --fail --output lz4-$(LZ4_VER).tar.gz --location ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz -endif LZ4_SHA256_ACTUAL=`$(SHA256_CMD) lz4-$(LZ4_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(LZ4_SHA256)" != "$$LZ4_SHA256_ACTUAL" ]; then \ echo lz4-$(LZ4_VER).tar.gz checksum mismatch, expected=\"$(LZ4_SHA256)\" actual=\"$$LZ4_SHA256_ACTUAL\"; \ exit 1; \ fi + +liblz4.a: lz4-$(LZ4_VER).tar.gz + -rm -rf lz4-$(LZ4_VER) tar xvzf lz4-$(LZ4_VER).tar.gz - cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' all + cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' all cp lz4-$(LZ4_VER)/lib/liblz4.a . -libzstd.a: - -rm -rf zstd-$(ZSTD_VER) -ifeq (,$(wildcard ./zstd-$(ZSTD_VER).tar.gz)) +zstd-$(ZSTD_VER).tar.gz: curl --fail --output zstd-$(ZSTD_VER).tar.gz --location ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz -endif ZSTD_SHA256_ACTUAL=`$(SHA256_CMD) zstd-$(ZSTD_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZSTD_SHA256)" != "$$ZSTD_SHA256_ACTUAL" ]; then \ echo zstd-$(ZSTD_VER).tar.gz checksum mismatch, expected=\"$(ZSTD_SHA256)\" actual=\"$$ZSTD_SHA256_ACTUAL\"; \ exit 1; \ fi + +libzstd.a: zstd-$(ZSTD_VER).tar.gz + -rm -rf zstd-$(ZSTD_VER) tar xvzf zstd-$(ZSTD_VER).tar.gz - cd zstd-$(ZSTD_VER)/lib && DESTDIR=. PREFIX= $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' install + cd zstd-$(ZSTD_VER)/lib && DESTDIR=. PREFIX= $(MAKE) CFLAGS='-fPIC -O2 ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' libzstd.a cp zstd-$(ZSTD_VER)/lib/libzstd.a . # A version of each $(LIB_OBJECTS) compiled with -fPIC and a fixed set of static compression libraries @@ -2179,14 +2116,23 @@ JAVA_COMPRESSIONS = libz.a libbz2.a libsnappy.a liblz4.a libzstd.a endif JAVA_STATIC_FLAGS = -DZLIB -DBZIP2 -DSNAPPY -DLZ4 -DZSTD -JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib/include -ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) +JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./snappy-$(SNAPPY_VER)/build -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib -I./zstd-$(ZSTD_VER)/lib/dictBuilder + +ifneq ($(findstring rocksdbjavastatic, $(filter-out rocksdbjavastatic_deps, $(MAKECMDGOALS))),) CXXFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +endif +rocksdbjavastatic: +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) endif -rocksdbjavastatic: $(LIB_OBJECTS) $(JAVA_COMPRESSIONS) - cd java;$(MAKE) javalib; - rm -f ./java/target/$(ROCKSDBJNILIB) + $(MAKE) rocksdbjavastatic_deps + $(MAKE) rocksdbjavastatic_libobjects + $(MAKE) rocksdbjavastatic_javalib + +rocksdbjavastatic_javalib: + cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib + rm -f java/target/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC \ -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \ $(LIB_OBJECTS) $(COVERAGEFLAGS) \ @@ -2203,6 +2149,10 @@ rocksdbjavastatic: $(LIB_OBJECTS) $(JAVA_COMPRESSIONS) openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1 openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1 +rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS) + +rocksdbjavastatic_libobjects: $(LIB_OBJECTS) + rocksdbjavastaticrelease: rocksdbjavastatic cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md @@ -2218,7 +2168,7 @@ rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 roc rocksdbjavastaticdockerx86: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target @@ -2234,7 +2184,7 @@ rocksdbjavastaticdockerarm64v8: rocksdbjavastaticdockerx86musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-musl-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64musl: mkdir -p java/target @@ -2272,7 +2222,10 @@ jl/%.o: %.cc $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS) rocksdbjava: $(LIB_OBJECTS) - $(AM_V_GEN)cd java;$(MAKE) javalib; +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) +endif + $(AM_V_GEN)cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib; $(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB) $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) $(AM_V_at)cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md @@ -2284,13 +2237,13 @@ jclean: cd java;$(MAKE) clean; jtest_compile: rocksdbjava - cd java;$(MAKE) java_test + cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) java_test jtest_run: cd java;$(MAKE) run_test jtest: rocksdbjava - cd java;$(MAKE) sample;$(MAKE) test; + cd java;$(MAKE) sample; SHA256_CMD='$(SHA256_CMD)' $(MAKE) test; $(PYTHON) tools/check_all_python.py # TODO peterd: find a better place for this check in CI targets jdb_bench: @@ -2349,12 +2302,14 @@ endif # --------------------------------------------------------------------------- # Source files dependencies detection # --------------------------------------------------------------------------- - +# If skip dependencies is ON, skip including the dep files +ifneq ($(SKIP_DEPENDS), 1) DEPFILES = $(patsubst %.cc, $(OBJ_DIR)/%.cc.d, $(ALL_SOURCES)) DEPFILES+ = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C) $(TEST_MAIN_SOURCES_C)) ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) DEPFILES +=$(patsubst %.cpp, $(OBJ_DIR)/%.cpp.d, $(FOLLY_SOURCES)) endif +endif # Add proper dependency support so changing a .h file forces a .cc file to # rebuild. @@ -2394,20 +2349,9 @@ endif build_subset_tests: $(ROCKSDBTESTS_SUBSET) $(AM_V_GEN)if [ -n "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}" ]; then echo "$(ROCKSDBTESTS_SUBSET)" > "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}"; else echo "$(ROCKSDBTESTS_SUBSET)"; fi -# if the make goal is either "clean" or "format", we shouldn't -# try to import the *.d files. -# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly -# working solution. -ifneq ($(MAKECMDGOALS),clean) -ifneq ($(MAKECMDGOALS),format) -ifneq ($(MAKECMDGOALS),jclean) -ifneq ($(MAKECMDGOALS),jtest) -ifneq ($(MAKECMDGOALS),package) -ifneq ($(MAKECMDGOALS),analyze) +# Remove the rules for which dependencies should not be generated and see if any are left. +#If so, include the dependencies; if not, do not include the dependency files +ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test, $(MAKECMDGOALS)) +ifneq ("$(ROCKS_DEP_RULES)", "") -include $(DEPFILES) endif -endif -endif -endif -endif -endif diff --git a/PLUGINS.md b/PLUGINS.md new file mode 100644 index 00000000000..ec30b12f15b --- /dev/null +++ b/PLUGINS.md @@ -0,0 +1,4 @@ +This is the list of all known third-party plugins for RocksDB. If something is missing, please open a pull request to add it. + +* [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference +* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices diff --git a/TARGETS b/TARGETS index 96da3a19731..563252a39d3 100644 --- a/TARGETS +++ b/TARGETS @@ -1,4 +1,5 @@ -# This file @generated by `python3 buckifier/buckify_rocksdb.py` +# This file @generated by: +#$ python3 buckifier/buckify_rocksdb.py # --> DO NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. 
@@ -9,7 +10,7 @@ load(":defs.bzl", "test_binary") REPO_PATH = package_name() + "/" -ROCKSDB_COMPILER_FLAGS = [ +ROCKSDB_COMPILER_FLAGS_0 = [ "-fno-builtin-memcmp", # Needed to compile in fbcode "-Wno-expansion-to-defined", @@ -24,10 +25,10 @@ ROCKSDB_EXTERNAL_DEPS = [ ("zlib", None, "z"), ("gflags", None, "gflags"), ("lz4", None, "lz4"), - ("zstd", None), + ("zstd", None, "zstd"), ] -ROCKSDB_OS_DEPS = [ +ROCKSDB_OS_DEPS_0 = [ ( "linux", ["third-party//numa:numa", "third-party//liburing:uring", "third-party//tbb:tbb"], @@ -38,7 +39,7 @@ ROCKSDB_OS_DEPS = [ ), ] -ROCKSDB_OS_PREPROCESSOR_FLAGS = [ +ROCKSDB_OS_PREPROCESSOR_FLAGS_0 = [ ( "linux", [ @@ -86,10 +87,12 @@ ROCKSDB_PREPROCESSOR_FLAGS = [ # Added missing flags from output of build_detect_platform "-DROCKSDB_BACKTRACE", +] - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +# Directories with files for #include +ROCKSDB_INCLUDE_PATHS = [ + "", + "include", ] ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { @@ -104,18 +107,18 @@ is_opt_mode = build_mode.startswith("opt") # -DNDEBUG is added by default in opt mode in fbcode. But adding it twice # doesn't harm and avoid forgetting to add it. -ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) +ROCKSDB_COMPILER_FLAGS = ROCKSDB_COMPILER_FLAGS_0 + (["-DNDEBUG"] if is_opt_mode else []) sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( +ROCKSDB_OS_PREPROCESSOR_FLAGS = ROCKSDB_OS_PREPROCESSOR_FLAGS_0 + ([( "linux", ["-DROCKSDB_JEMALLOC"], )] if sanitizer == "" else []) -ROCKSDB_OS_DEPS += ([( +ROCKSDB_OS_DEPS = ROCKSDB_OS_DEPS_0 + ([( "linux", ["third-party//jemalloc:headers"], )] if sanitizer == "" else []) @@ -129,6 +132,7 @@ cpp_library( name = "rocksdb_lib", srcs = [ "cache/cache.cc", + "cache/cache_entry_roles.cc", "cache/clock_cache.cc", "cache/lru_cache.cc", "cache/sharded_cache.cc", @@ -148,8 +152,10 @@ cpp_library( "cloud/manifest_reader.cc", "cloud/purge.cc", "db/arena_wrapped_db_iter.cc", + "db/blob/blob_fetcher.cc", "db/blob/blob_file_addition.cc", "db/blob/blob_file_builder.cc", + "db/blob/blob_file_cache.cc", "db/blob/blob_file_garbage.cc", "db/blob/blob_file_meta.cc", "db/blob/blob_file_reader.cc", @@ -159,7 +165,6 @@ cpp_library( "db/builder.cc", "db/c.cc", "db/column_family.cc", - "db/compacted_db_impl.cc", "db/compaction/compaction.cc", "db/compaction/compaction_iterator.cc", "db/compaction/compaction_job.cc", @@ -170,6 +175,7 @@ cpp_library( "db/compaction/sst_partitioner.cc", "db/convenience.cc", "db/db_filesnapshot.cc", + "db/db_impl/compacted_db_impl.cc", "db/db_impl/db_impl.cc", "db/db_impl/db_impl_compaction_flush.cc", "db/db_impl/db_impl_debug.cc", @@ -221,6 +227,7 @@ cpp_library( "db/write_batch_base.cc", "db/write_controller.cc", "db/write_thread.cc", + "env/composite_env.cc", "env/env.cc", "env/env_chroot.cc", "env/env_encryption.cc", @@ -229,12 +236,14 @@ cpp_library( "env/file_system.cc", "env/file_system_tracer.cc", "env/fs_posix.cc", + "env/fs_remap.cc", "env/io_posix.cc", "env/mock_env.cc", "file/delete_scheduler.cc", "file/file_prefetch_buffer.cc", "file/file_util.cc", "file/filename.cc", + "file/line_file_reader.cc", "file/random_access_file_reader.cc", "file/read_write_util.cc", "file/readahead_raf.cc", @@ -270,6 +279,7 @@ cpp_library( "monitoring/thread_status_util_debug.cc", "options/cf_options.cc", "options/configurable.cc", + 
"options/customizable.cc", "options/db_options.cc", "options/options.cc", "options/options_helper.cc", @@ -348,12 +358,14 @@ cpp_library( "util/compression_context_cache.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", + "util/crc32c_arm64.cc", "util/dynamic_bloom.cc", "util/file_checksum_helper.cc", "util/hash.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", + "util/ribbon_config.cc", "util/slice.cc", "util/status.cc", "util/string_util.cc", @@ -399,8 +411,23 @@ cpp_library( "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", - "utilities/transactions/lock/lock_tracker.cc", - "utilities/transactions/lock/point_lock_tracker.cc", + "utilities/transactions/lock/lock_manager.cc", + "utilities/transactions/lock/point/point_lock_manager.cc", + "utilities/transactions/lock/point/point_lock_tracker.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", + "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", + "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", + "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/pessimistic_transaction.cc", @@ -408,7 +435,6 @@ cpp_library( "utilities/transactions/snapshot_checker.cc", "utilities/transactions/transaction_base.cc", "utilities/transactions/transaction_db_mutex_impl.cc", - "utilities/transactions/transaction_lock_mgr.cc", "utilities/transactions/transaction_util.cc", "utilities/transactions/write_prepared_txn.cc", "utilities/transactions/write_prepared_txn_db.cc", @@ -424,6 +450,7 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [], external_deps = ROCKSDB_EXTERNAL_DEPS, link_whole = False, @@ -433,6 +460,7 @@ cpp_library( name = "rocksdb_whole_archive_lib", srcs = [ "cache/cache.cc", + "cache/cache_entry_roles.cc", "cache/clock_cache.cc", "cache/lru_cache.cc", "cache/sharded_cache.cc", @@ -452,8 +480,10 @@ cpp_library( "cloud/manifest_reader.cc", "cloud/purge.cc", "db/arena_wrapped_db_iter.cc", + "db/blob/blob_fetcher.cc", "db/blob/blob_file_addition.cc", "db/blob/blob_file_builder.cc", + "db/blob/blob_file_cache.cc", "db/blob/blob_file_garbage.cc", "db/blob/blob_file_meta.cc", "db/blob/blob_file_reader.cc", @@ -463,7 +493,6 @@ cpp_library( "db/builder.cc", "db/c.cc", "db/column_family.cc", - "db/compacted_db_impl.cc", "db/compaction/compaction.cc", "db/compaction/compaction_iterator.cc", 
"db/compaction/compaction_job.cc", @@ -474,6 +503,7 @@ cpp_library( "db/compaction/sst_partitioner.cc", "db/convenience.cc", "db/db_filesnapshot.cc", + "db/db_impl/compacted_db_impl.cc", "db/db_impl/db_impl.cc", "db/db_impl/db_impl_compaction_flush.cc", "db/db_impl/db_impl_debug.cc", @@ -525,6 +555,7 @@ cpp_library( "db/write_batch_base.cc", "db/write_controller.cc", "db/write_thread.cc", + "env/composite_env.cc", "env/env.cc", "env/env_chroot.cc", "env/env_encryption.cc", @@ -533,12 +564,14 @@ cpp_library( "env/file_system.cc", "env/file_system_tracer.cc", "env/fs_posix.cc", + "env/fs_remap.cc", "env/io_posix.cc", "env/mock_env.cc", "file/delete_scheduler.cc", "file/file_prefetch_buffer.cc", "file/file_util.cc", "file/filename.cc", + "file/line_file_reader.cc", "file/random_access_file_reader.cc", "file/read_write_util.cc", "file/readahead_raf.cc", @@ -574,6 +607,7 @@ cpp_library( "monitoring/thread_status_util_debug.cc", "options/cf_options.cc", "options/configurable.cc", + "options/customizable.cc", "options/db_options.cc", "options/options.cc", "options/options_helper.cc", @@ -652,12 +686,14 @@ cpp_library( "util/compression_context_cache.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", + "util/crc32c_arm64.cc", "util/dynamic_bloom.cc", "util/file_checksum_helper.cc", "util/hash.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", + "util/ribbon_config.cc", "util/slice.cc", "util/status.cc", "util/string_util.cc", @@ -703,8 +739,23 @@ cpp_library( "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", - "utilities/transactions/lock/lock_tracker.cc", - "utilities/transactions/lock/point_lock_tracker.cc", + "utilities/transactions/lock/lock_manager.cc", + "utilities/transactions/lock/point/point_lock_manager.cc", + "utilities/transactions/lock/point/point_lock_tracker.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", + "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", + "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", + "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/pessimistic_transaction.cc", @@ -712,7 +763,6 @@ cpp_library( "utilities/transactions/snapshot_checker.cc", "utilities/transactions/transaction_base.cc", "utilities/transactions/transaction_db_mutex_impl.cc", - "utilities/transactions/transaction_lock_mgr.cc", "utilities/transactions/transaction_util.cc", "utilities/transactions/write_prepared_txn.cc", "utilities/transactions/write_prepared_txn_db.cc", @@ -728,6 +778,7 @@ 
cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [], external_deps = ROCKSDB_EXTERNAL_DEPS, link_whole = True, @@ -751,6 +802,7 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS + [ ("googletest", None, "gtest"), @@ -764,6 +816,7 @@ cpp_library( "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", + "tools/simulated_hybrid_file_system.cc", "tools/trace_analyzer_tool.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, @@ -772,6 +825,22 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + deps = [":rocksdb_lib"], + external_deps = ROCKSDB_EXTERNAL_DEPS, + link_whole = False, +) + +cpp_library( + name = "rocksdb_cache_bench_tools_lib", + srcs = ["cache/cache_bench_tool.cc"], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, link_whole = False, @@ -799,30 +868,30 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = ROCKSDB_LIB_DEPS, external_deps = ROCKSDB_EXTERNAL_DEPS, ) -if not is_opt_mode: - cpp_binary( - name = "c_test_bin", - srcs = ["db/c_test.c"], - arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, - os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, - compiler_flags = ROCKSDB_COMPILER_FLAGS, - preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_test_lib"], - ) +cpp_binary( + name = "c_test_bin", + srcs = ["db/c_test.c"], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + deps = [":rocksdb_test_lib"], +) if not is_opt_mode else None -if not is_opt_mode: - custom_unittest( - "c_test", - command = [ - native.package_name() + "/buckifier/rocks_test_runner.sh", - "$(location :{})".format("c_test_bin"), - ], - type = "simple", - ) +custom_unittest( + name = "c_test", + command = [ + native.package_name() + "/buckifier/rocks_test_runner.sh", + "$(location :{})".format("c_test_bin"), + ], + type = "simple", +) if not is_opt_mode else None cpp_library( name = "env_basic_test_lib", @@ -833,6 +902,7 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_test_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, link_whole = False, @@ -843,21 +913,21 @@ ROCKS_TESTS = [ [ "arena_test", "memory/arena_test.cc", - "serial", + "parallel", [], [], ], [ "auto_roll_logger_test", "logging/auto_roll_logger_test.cc", - "serial", + 
"parallel", [], [], ], [ "autovector_test", "util/autovector_test.cc", - "serial", + "parallel", [], [], ], @@ -871,140 +941,154 @@ ROCKS_TESTS = [ [ "blob_db_test", "utilities/blob_db/blob_db_test.cc", - "serial", + "parallel", [], [], ], [ "blob_file_addition_test", "db/blob/blob_file_addition_test.cc", - "serial", + "parallel", [], [], ], [ "blob_file_builder_test", "db/blob/blob_file_builder_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "blob_file_cache_test", + "db/blob/blob_file_cache_test.cc", + "parallel", [], [], ], [ "blob_file_garbage_test", "db/blob/blob_file_garbage_test.cc", - "serial", + "parallel", [], [], ], [ "blob_file_reader_test", "db/blob/blob_file_reader_test.cc", - "serial", + "parallel", [], [], ], [ "block_based_filter_block_test", "table/block_based/block_based_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "block_based_table_reader_test", "table/block_based/block_based_table_reader_test.cc", - "serial", + "parallel", [], [], ], [ "block_cache_trace_analyzer_test", "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", - "serial", + "parallel", [], [], ], [ "block_cache_tracer_test", "trace_replay/block_cache_tracer_test.cc", - "serial", + "parallel", [], [], ], [ "block_fetcher_test", "table/block_fetcher_test.cc", - "serial", + "parallel", [], [], ], [ "block_test", "table/block_based/block_test.cc", - "serial", + "parallel", [], [], ], [ "bloom_test", "util/bloom_test.cc", - "serial", + "parallel", [], [], ], [ "cache_simulator_test", "utilities/simulator_cache/cache_simulator_test.cc", - "serial", + "parallel", [], [], ], [ "cache_test", "cache/cache_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_functional_test", "utilities/cassandra/cassandra_functional_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_row_merge_test", "utilities/cassandra/cassandra_row_merge_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_serialize_test", "utilities/cassandra/cassandra_serialize_test.cc", - "serial", + "parallel", [], [], ], [ "checkpoint_test", "utilities/checkpoint/checkpoint_test.cc", - "serial", + "parallel", [], [], ], [ "cleanable_test", "table/cleanable_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "clipping_iterator_test", + "db/compaction/clipping_iterator_test.cc", + "parallel", [], [], ], @@ -1025,7 +1109,7 @@ ROCKS_TESTS = [ [ "coding_test", "util/coding_test.cc", - "serial", + "parallel", [], [], ], @@ -1039,119 +1123,154 @@ ROCKS_TESTS = [ [ "compact_files_test", "db/compact_files_test.cc", - "serial", + "parallel", [], [], ], [ "compact_on_deletion_collector_test", "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_iterator_test", "db/compaction/compaction_iterator_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_job_stats_test", "db/compaction/compaction_job_stats_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_job_test", "db/compaction/compaction_job_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_picker_test", "db/compaction/compaction_picker_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "compaction_service_test", + "db/compaction/compaction_service_test.cc", + "parallel", [], [], ], [ "comparator_db_test", "db/comparator_db_test.cc", - "serial", + "parallel", [], [], ], [ "configurable_test", "options/configurable_test.cc", - 
"serial", + "parallel", [], [], ], [ "corruption_test", "db/corruption_test.cc", - "serial", + "parallel", [], [], ], [ "crc32c_test", "util/crc32c_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_builder_test", "table/cuckoo/cuckoo_table_builder_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_db_test", "db/cuckoo_table_db_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_reader_test", "table/cuckoo/cuckoo_table_reader_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "customizable_test", + "options/customizable_test.cc", + "parallel", [], [], ], [ "data_block_hash_index_test", "table/block_based/data_block_hash_index_test.cc", - "serial", + "parallel", [], [], ], [ "db_basic_test", "db/db_basic_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_blob_basic_test", + "db/blob/db_blob_basic_test.cc", + "parallel", + [], + [], + ], + [ + "db_blob_compaction_test", + "db/blob/db_blob_compaction_test.cc", + "parallel", + [], + [], + ], + [ + "db_blob_corruption_test", + "db/blob/db_blob_corruption_test.cc", + "parallel", [], [], ], [ "db_blob_index_test", "db/blob/db_blob_index_test.cc", - "serial", + "parallel", [], [], ], [ "db_block_cache_test", "db/db_block_cache_test.cc", - "serial", + "parallel", [], [], ], @@ -1186,84 +1305,91 @@ ROCKS_TESTS = [ [ "db_dynamic_level_test", "db/db_dynamic_level_test.cc", - "serial", + "parallel", [], [], ], [ "db_encryption_test", "db/db_encryption_test.cc", - "serial", + "parallel", [], [], ], [ "db_flush_test", "db/db_flush_test.cc", - "serial", + "parallel", [], [], ], [ "db_inplace_update_test", "db/db_inplace_update_test.cc", - "serial", + "parallel", [], [], ], [ "db_io_failure_test", "db/db_io_failure_test.cc", - "serial", + "parallel", [], [], ], [ "db_iter_stress_test", "db/db_iter_stress_test.cc", - "serial", + "parallel", [], [], ], [ "db_iter_test", "db/db_iter_test.cc", - "serial", + "parallel", [], [], ], [ "db_iterator_test", "db/db_iterator_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_kv_checksum_test", + "db/db_kv_checksum_test.cc", + "parallel", [], [], ], [ "db_log_iter_test", "db/db_log_iter_test.cc", - "serial", + "parallel", [], [], ], [ "db_logical_block_size_cache_test", "db/db_logical_block_size_cache_test.cc", - "serial", + "parallel", [], [], ], [ "db_memtable_test", "db/db_memtable_test.cc", - "serial", + "parallel", [], [], ], [ "db_merge_operand_test", "db/db_merge_operand_test.cc", - "serial", + "parallel", [], [], ], @@ -1277,28 +1403,28 @@ ROCKS_TESTS = [ [ "db_options_test", "db/db_options_test.cc", - "serial", + "parallel", [], [], ], [ "db_properties_test", "db/db_properties_test.cc", - "serial", + "parallel", [], [], ], [ "db_range_del_test", "db/db_range_del_test.cc", - "serial", + "parallel", [], [], ], [ "db_secondary_test", - "db/db_impl/db_secondary_test.cc", - "serial", + "db/db_secondary_test.cc", + "parallel", [], [], ], @@ -1312,21 +1438,21 @@ ROCKS_TESTS = [ [ "db_statistics_test", "db/db_statistics_test.cc", - "serial", + "parallel", [], [], ], [ "db_table_properties_test", "db/db_table_properties_test.cc", - "serial", + "parallel", [], [], ], [ "db_tailing_iter_test", "db/db_tailing_iter_test.cc", - "serial", + "parallel", [], [], ], @@ -1361,42 +1487,49 @@ ROCKS_TESTS = [ [ "db_with_timestamp_basic_test", "db/db_with_timestamp_basic_test.cc", - "serial", + "parallel", [], [], ], [ "db_with_timestamp_compaction_test", "db/db_with_timestamp_compaction_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + 
"db_write_buffer_manager_test", + "db/db_write_buffer_manager_test.cc", + "parallel", [], [], ], [ "db_write_test", "db/db_write_test.cc", - "serial", + "parallel", [], [], ], [ "dbformat_test", "db/dbformat_test.cc", - "serial", + "parallel", [], [], ], [ "defer_test", "util/defer_test.cc", - "serial", + "parallel", [], [], ], [ "delete_scheduler_test", "file/delete_scheduler_test.cc", - "serial", + "parallel", [], [], ], @@ -1410,21 +1543,21 @@ ROCKS_TESTS = [ [ "dynamic_bloom_test", "util/dynamic_bloom_test.cc", - "serial", + "parallel", [], [], ], [ "env_basic_test", "env/env_basic_test.cc", - "serial", + "parallel", [], [], ], [ "env_logger_test", "logging/env_logger_test.cc", - "serial", + "parallel", [], [], ], @@ -1438,28 +1571,28 @@ ROCKS_TESTS = [ [ "env_timed_test", "utilities/env_timed_test.cc", - "serial", + "parallel", [], [], ], [ "error_handler_fs_test", "db/error_handler_fs_test.cc", - "serial", + "parallel", [], [], ], [ "event_logger_test", "logging/event_logger_test.cc", - "serial", + "parallel", [], [], ], [ "external_sst_file_basic_test", "db/external_sst_file_basic_test.cc", - "serial", + "parallel", [], [], ], @@ -1480,7 +1613,7 @@ ROCKS_TESTS = [ [ "file_indexer_test", "db/file_indexer_test.cc", - "serial", + "parallel", [], [], ], @@ -1494,56 +1627,56 @@ ROCKS_TESTS = [ [ "filelock_test", "util/filelock_test.cc", - "serial", + "parallel", [], [], ], [ "filename_test", "db/filename_test.cc", - "serial", + "parallel", [], [], ], [ "flush_job_test", "db/flush_job_test.cc", - "serial", + "parallel", [], [], ], [ "full_filter_block_test", "table/block_based/full_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "hash_table_test", "utilities/persistent_cache/hash_table_test.cc", - "serial", + "parallel", [], [], ], [ "hash_test", "util/hash_test.cc", - "serial", + "parallel", [], [], ], [ "heap_test", "util/heap_test.cc", - "serial", + "parallel", [], [], ], [ "histogram_test", "monitoring/histogram_test.cc", - "serial", + "parallel", [], [], ], @@ -1564,56 +1697,56 @@ ROCKS_TESTS = [ [ "io_posix_test", "env/io_posix_test.cc", - "serial", + "parallel", [], [], ], [ "io_tracer_parser_test", "tools/io_tracer_parser_test.cc", - "serial", + "parallel", [], [], ], [ "io_tracer_test", "trace_replay/io_tracer_test.cc", - "serial", + "parallel", [], [], ], [ "iostats_context_test", "monitoring/iostats_context_test.cc", - "serial", + "parallel", [], [], ], [ "ldb_cmd_test", "tools/ldb_cmd_test.cc", - "serial", + "parallel", [], [], ], [ "listener_test", "db/listener_test.cc", - "serial", + "parallel", [], [], ], [ "log_test", "db/log_test.cc", - "serial", + "parallel", [], [], ], [ "lru_cache_test", "cache/lru_cache_test.cc", - "serial", + "parallel", [], [], ], @@ -1627,126 +1760,126 @@ ROCKS_TESTS = [ [ "memkind_kmem_allocator_test", "memory/memkind_kmem_allocator_test.cc", - "serial", + "parallel", [], [], ], [ "memory_test", "utilities/memory/memory_test.cc", - "serial", + "parallel", [], [], ], [ "memtable_list_test", "db/memtable_list_test.cc", - "serial", + "parallel", [], [], ], [ "merge_helper_test", "db/merge_helper_test.cc", - "serial", + "parallel", [], [], ], [ "merge_test", "db/merge_test.cc", - "serial", + "parallel", [], [], ], [ "merger_test", "table/merger_test.cc", - "serial", + "parallel", [], [], ], [ "mock_env_test", "env/mock_env_test.cc", - "serial", + "parallel", [], [], ], [ "object_registry_test", "utilities/object_registry_test.cc", - "serial", + "parallel", [], [], ], [ "obsolete_files_test", "db/obsolete_files_test.cc", - "serial", + 
"parallel", [], [], ], [ "optimistic_transaction_test", "utilities/transactions/optimistic_transaction_test.cc", - "serial", + "parallel", [], [], ], [ "option_change_migration_test", "utilities/option_change_migration/option_change_migration_test.cc", - "serial", + "parallel", [], [], ], [ "options_file_test", "db/options_file_test.cc", - "serial", + "parallel", [], [], ], [ "options_settable_test", "options/options_settable_test.cc", - "serial", + "parallel", [], [], ], [ "options_test", "options/options_test.cc", - "serial", + "parallel", [], [], ], [ "options_util_test", "utilities/options/options_util_test.cc", - "serial", + "parallel", [], [], ], [ "partitioned_filter_block_test", "table/block_based/partitioned_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "perf_context_test", "db/perf_context_test.cc", - "serial", + "parallel", [], [], ], [ "periodic_work_scheduler_test", "db/periodic_work_scheduler_test.cc", - "serial", + "parallel", [], [], ], @@ -1760,63 +1893,77 @@ ROCKS_TESTS = [ [ "plain_table_db_test", "db/plain_table_db_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "point_lock_manager_test", + "utilities/transactions/lock/point/point_lock_manager_test.cc", + "parallel", [], [], ], [ "prefetch_test", "file/prefetch_test.cc", - "serial", + "parallel", [], [], ], [ "prefix_test", "db/prefix_test.cc", - "serial", + "parallel", [], [], ], [ "random_access_file_reader_test", "file/random_access_file_reader_test.cc", - "serial", + "parallel", [], [], ], [ "random_test", "util/random_test.cc", - "serial", + "parallel", [], [], ], [ "range_del_aggregator_test", "db/range_del_aggregator_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "range_locking_test", + "utilities/transactions/lock/range/range_locking_test.cc", + "parallel", [], [], ], [ "range_tombstone_fragmenter_test", "db/range_tombstone_fragmenter_test.cc", - "serial", + "parallel", [], [], ], [ "rate_limiter_test", "util/rate_limiter_test.cc", - "serial", + "parallel", [], [], ], [ "reduce_levels_test", "tools/reduce_levels_test.cc", - "serial", + "parallel", [], [], ], @@ -1830,84 +1977,91 @@ ROCKS_TESTS = [ [ "repair_test", "db/repair_test.cc", - "serial", + "parallel", [], [], ], [ "repeatable_thread_test", "util/repeatable_thread_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "ribbon_test", + "util/ribbon_test.cc", + "parallel", [], [], ], [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", - "serial", + "parallel", [], [], ], [ "skiplist_test", "memtable/skiplist_test.cc", - "serial", + "parallel", [], [], ], [ "slice_test", "util/slice_test.cc", - "serial", + "parallel", [], [], ], [ "slice_transform_test", "util/slice_transform_test.cc", - "serial", + "parallel", [], [], ], [ "sst_dump_test", "tools/sst_dump_test.cc", - "serial", + "parallel", [], [], ], [ "sst_file_reader_test", "table/sst_file_reader_test.cc", - "serial", + "parallel", [], [], ], [ "statistics_test", "monitoring/statistics_test.cc", - "serial", + "parallel", [], [], ], [ "stats_history_test", "monitoring/stats_history_test.cc", - "serial", + "parallel", [], [], ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", - "serial", + "parallel", [], [], ], [ "table_properties_collector_test", "db/table_properties_collector_test.cc", - "serial", + "parallel", [], [], ], @@ -1921,48 +2075,41 @@ ROCKS_TESTS = [ [ "testutil_test", "test_util/testutil_test.cc", - "serial", + "parallel", [], [], ], [ "thread_list_test", "util/thread_list_test.cc", - "serial", 
+ "parallel", [], [], ], [ "thread_local_test", "util/thread_local_test.cc", - "serial", + "parallel", [], [], ], [ "timer_queue_test", "util/timer_queue_test.cc", - "serial", + "parallel", [], [], ], [ "timer_test", "util/timer_test.cc", - "serial", + "parallel", [], [], ], [ "trace_analyzer_test", "tools/trace_analyzer_test.cc", - "serial", - [], - [], - ], - [ - "transaction_lock_mgr_test", - "utilities/transactions/transaction_lock_mgr_test.cc", "parallel", [], [], @@ -1977,84 +2124,84 @@ ROCKS_TESTS = [ [ "ttl_test", "utilities/ttl/ttl_test.cc", - "serial", + "parallel", [], [], ], [ "util_merge_operators_test", "utilities/util_merge_operators_test.cc", - "serial", + "parallel", [], [], ], [ "version_builder_test", "db/version_builder_test.cc", - "serial", + "parallel", [], [], ], [ "version_edit_test", "db/version_edit_test.cc", - "serial", + "parallel", [], [], ], [ "version_set_test", "db/version_set_test.cc", - "serial", + "parallel", [], [], ], [ "wal_manager_test", "db/wal_manager_test.cc", - "serial", + "parallel", [], [], ], [ "work_queue_test", "util/work_queue_test.cc", - "serial", + "parallel", [], [], ], [ "write_batch_test", "db/write_batch_test.cc", - "serial", + "parallel", [], [], ], [ "write_batch_with_index_test", "utilities/write_batch_with_index/write_batch_with_index_test.cc", - "serial", + "parallel", [], [], ], [ "write_buffer_manager_test", "memtable/write_buffer_manager_test.cc", - "serial", + "parallel", [], [], ], [ "write_callback_test", "db/write_callback_test.cc", - "serial", + "parallel", [], [], ], [ "write_controller_test", "db/write_controller_test.cc", - "serial", + "parallel", [], [], ], @@ -2085,6 +2232,7 @@ ROCKS_TESTS = [ os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_test_lib"] + extra_deps, external_deps = ROCKSDB_EXTERNAL_DEPS + [ ("googletest", None, "gtest"), diff --git a/USERS.md b/USERS.md index 11d52a0519c..fb612efabd0 100644 --- a/USERS.md +++ b/USERS.md @@ -26,6 +26,9 @@ Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasu ## Yahoo Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights +## Baidu +[Apache Doris](http://doris.apache.org/master/en/) is a MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata. + ## CockroachDB CockroachDB is an open-source geo-replicated transactional database. They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach @@ -44,7 +47,7 @@ Tango is using RocksDB as a graph storage to store all users' connection data an Turn is using RocksDB as a storage layer for their key/value store, serving at peak 2.4MM QPS out of different datacenters. 
Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/rocksdb_protobuf -## Santanader UK/Cloudera Profession Services +## Santander UK/Cloudera Profession Services Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santanders-near-real-time-data-ingest-architecture/ ## Airbnb @@ -67,7 +70,7 @@ Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtub [VWO's](https://vwo.com/) Smart Code checker and URL helper uses RocksDB to store all the URLs where VWO's Smart Code is installed. ## quasardb -[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. +[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. quasardb uses a heavily tuned RocksDB as its persistence layer. ## Netflix @@ -86,7 +89,7 @@ quasardb uses a heavily tuned RocksDB as its persistence layer. [Uber](http://eng.uber.com/cherami/) uses RocksDB as a durable and scalable task queue. ## 360 Pika -[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been widely used in many company +[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been used in many companies. ## LzLabs LzLabs is using RocksDB as a storage engine in their multi-database distributed framework to store application configuration and user data. @@ -96,16 +99,19 @@ LzLabs is using RocksDB as a storage engine in their multi-database distributed ## IOTA Foundation [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. - + ## Avrio Project [Avrio Project](http://avrio-project.github.io/avrio.network/) is using RocksDB in [Avrio ](https://github.com/avrio-project/avrio) to store blocks, account balances and data and other blockchain-releated data. Avrio is a multiblockchain decentralized cryptocurrency empowering monetary transactions. - + ## Crux [Crux](https://github.com/juxt/crux) is a document database that uses RocksDB for local [EAV](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model) index storage to enable point-in-time bitemporal Datalog queries. The "unbundled" architecture uses Kafka to provide horizontal scalability. ## Nebula Graph - [Nebula Graph](https://github.com/vesoft-inc/nebula) is a distributed, scalable, lightning-fast, open source graph database capable of hosting super large scale graphs with dozens of billions of vertices (nodes) and trillions of edges, with milliseconds of latency. ## YugabyteDB [YugabyteDB](https://www.yugabyte.com/) is an open source, high performance, distributed SQL database that uses RocksDB as its storage layer. For more information, please see https://github.com/yugabyte/yugabyte-db/. 
+ +## ArangoDB +[ArangoDB](https://www.arangodb.com/) is a native multi-model database with flexible data models for documents, graphs, and key-values, for building high performance applications using a convenient SQL-like query language or JavaScript extensions. It uses RocksDB as its storage engine. + diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index f0909bc6171..993dd4d1b18 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -24,10 +24,10 @@ # (This generates a TARGET file without user-specified dependency for unit # tests.) # $python3 buckifier/buckify_rocksdb.py \ -# '{"fake": { \ -# "extra_deps": [":test_dep", "//fakes/module:mock1"], \ -# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"], \ -# } \ +# '{"fake": { +# "extra_deps": [":test_dep", "//fakes/module:mock1"], +# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"] +# } # }' # (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB # unit tests, and will use the extra_compiler_flags to compile the unit test @@ -69,25 +69,25 @@ def get_cc_files(repo_path): return cc_files -# Get parallel tests from Makefile -def get_parallel_tests(repo_path): +# Get non_parallel tests from Makefile +def get_non_parallel_tests(repo_path): Makefile = repo_path + "/Makefile" s = set({}) - found_parallel_tests = False + found_non_parallel_tests = False for line in open(Makefile): line = line.strip() - if line.startswith("PARALLEL_TEST ="): - found_parallel_tests = True - elif found_parallel_tests: + if line.startswith("NON_PARALLEL_TEST ="): + found_non_parallel_tests = True + elif found_non_parallel_tests: if line.endswith("\\"): # remove the trailing \ line = line[:-1] line = line.strip() s.add(line) else: - # we consumed all the parallel tests + # we consumed all the non_parallel tests break return s @@ -123,23 +123,33 @@ def generate_targets(repo_path, deps_map): src_mk = parse_src_mk(repo_path) # get all .cc files cc_files = get_cc_files(repo_path) - # get parallel tests from Makefile - parallel_tests = get_parallel_tests(repo_path) + # get non_parallel tests from Makefile + non_parallel_tests = get_non_parallel_tests(repo_path) - if src_mk is None or cc_files is None or parallel_tests is None: + if src_mk is None or cc_files is None or non_parallel_tests is None: return False - TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path) + extra_argv = "" + if len(sys.argv) >= 2: + # Heuristically quote and canonicalize whitespace for inclusion + # in how the file was generated.
+ extra_argv = " '{0}'".format(" ".join(sys.argv[1].split())) + + TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv) # rocksdb_lib TARGETS.add_library( "rocksdb_lib", src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"]) # rocksdb_whole_archive_lib TARGETS.add_library( "rocksdb_whole_archive_lib", src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"], deps=None, headers=None, @@ -163,6 +173,11 @@ def generate_targets(repo_path, deps_map): src_mk.get("ANALYZER_LIB_SOURCES", []) + ["test_util/testutil.cc"], [":rocksdb_lib"]) + # rocksdb_cache_bench_tools_lib + TARGETS.add_library( + "rocksdb_cache_bench_tools_lib", + src_mk.get("CACHE_BENCH_LIB_SOURCES", []), + [":rocksdb_lib"]) # rocksdb_stress_lib TARGETS.add_rocksdb_library( "rocksdb_stress_lib", @@ -201,7 +216,7 @@ def generate_targets(repo_path, deps_map): TARGETS.register_test( test_target_name, test_src, - test in parallel_tests, + test not in non_parallel_tests, json.dumps(deps['extra_deps']), json.dumps(deps['extra_compiler_flags'])) diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index e9f6f5be40a..d2649c1311d 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -25,10 +25,11 @@ def pretty_list(lst, indent=8): class TARGETSBuilder(object): - def __init__(self, path): + def __init__(self, path, extra_argv): self.path = path self.targets_file = open(path, 'wb') - header = targets_cfg.rocksdb_target_header_template + header = targets_cfg.rocksdb_target_header_template.format( + extra_argv=extra_argv) self.targets_file.write(header.encode("utf-8")) self.total_lib = 0 self.total_bin = 0 @@ -79,26 +80,25 @@ def add_binary(self, name, srcs, deps=None): def add_c_test(self): self.targets_file.write(b""" -if not is_opt_mode: - cpp_binary( - name = "c_test_bin", - srcs = ["db/c_test.c"], - arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, - os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, - compiler_flags = ROCKSDB_COMPILER_FLAGS, - preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_test_lib"], - ) +cpp_binary( + name = "c_test_bin", + srcs = ["db/c_test.c"], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + deps = [":rocksdb_test_lib"], +) if not is_opt_mode else None -if not is_opt_mode: - custom_unittest( - "c_test", - command = [ - native.package_name() + "/buckifier/rocks_test_runner.sh", - "$(location :{})".format("c_test_bin"), - ], - type = "simple", - ) +custom_unittest( + name = "c_test", + command = [ + native.package_name() + "/buckifier/rocks_test_runner.sh", + "$(location :{})".format("c_test_bin"), + ], + type = "simple", +) if not is_opt_mode else None """) def register_test(self, diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 0c20ef095c8..38037b250d6 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -5,7 +5,8 @@ from __future__ import unicode_literals rocksdb_target_header_template = \ - """# This file \100generated by `python3 buckifier/buckify_rocksdb.py` + """# This file \100generated by: +#$ python3 buckifier/buckify_rocksdb.py{extra_argv} # --> DO 
NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. @@ -16,7 +17,7 @@ REPO_PATH = package_name() + "/" -ROCKSDB_COMPILER_FLAGS = [ +ROCKSDB_COMPILER_FLAGS_0 = [ "-fno-builtin-memcmp", # Needed to compile in fbcode "-Wno-expansion-to-defined", @@ -31,10 +32,10 @@ ("zlib", None, "z"), ("gflags", None, "gflags"), ("lz4", None, "lz4"), - ("zstd", None), + ("zstd", None, "zstd"), ] -ROCKSDB_OS_DEPS = [ +ROCKSDB_OS_DEPS_0 = [ ( "linux", ["third-party//numa:numa", "third-party//liburing:uring", "third-party//tbb:tbb"], @@ -45,7 +46,7 @@ ), ] -ROCKSDB_OS_PREPROCESSOR_FLAGS = [ +ROCKSDB_OS_PREPROCESSOR_FLAGS_0 = [ ( "linux", [ @@ -93,17 +94,19 @@ # Added missing flags from output of build_detect_platform "-DROCKSDB_BACKTRACE", +] - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +# Directories with files for #include +ROCKSDB_INCLUDE_PATHS = [ + "", + "include", ] -ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { +ROCKSDB_ARCH_PREPROCESSOR_FLAGS = {{ "x86_64": [ "-DHAVE_PCLMUL", ], -} +}} build_mode = read_config("fbcode", "build_mode") @@ -111,18 +114,18 @@ # -DNDEBUG is added by default in opt mode in fbcode. But adding it twice # doesn't harm and avoid forgetting to add it. -ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) +ROCKSDB_COMPILER_FLAGS = ROCKSDB_COMPILER_FLAGS_0 + (["-DNDEBUG"] if is_opt_mode else []) sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( +ROCKSDB_OS_PREPROCESSOR_FLAGS = ROCKSDB_OS_PREPROCESSOR_FLAGS_0 + ([( "linux", ["-DROCKSDB_JEMALLOC"], )] if sanitizer == "" else []) -ROCKSDB_OS_DEPS += ([( +ROCKSDB_OS_DEPS = ROCKSDB_OS_DEPS_0 + ([( "linux", ["third-party//jemalloc:headers"], )] if sanitizer == "" else []) @@ -144,6 +147,7 @@ os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS{extra_external_deps}, link_whole = {link_whole}, @@ -160,6 +164,7 @@ os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = ROCKSDB_LIB_DEPS, external_deps = ROCKSDB_EXTERNAL_DEPS, ) @@ -172,6 +177,7 @@ arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS, ) @@ -202,6 +208,7 @@ os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_test_lib"] + extra_deps, external_deps = ROCKSDB_EXTERNAL_DEPS + [ ("googletest", None, "gtest"), diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index bc8b0dbc769..b8e469e2a9c 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -45,8 +45,13 @@ if test -z "$OUTPUT"; then exit 1 fi -# we depend on C++11 -PLATFORM_CXXFLAGS="-std=c++11" +# we depend on C++11, but should be compatible with newer standards +if [ "$ROCKSDB_CXX_STANDARD" ]; then + 
PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD" +else + PLATFORM_CXXFLAGS="-std=c++11" +fi + # we currently depend on POSIX platform COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX" @@ -64,10 +69,6 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then source "$PWD/build_tools/fbcode_config_platform007.sh" elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then source "$PWD/build_tools/fbcode_config_platform009.sh" - elif [ -z "$USE_CLANG" ]; then - # Still use platform007 for gcc by default for build break on - # some hosts. - source "$PWD/build_tools/fbcode_config_platform007.sh" else source "$PWD/build_tools/fbcode_config_platform009.sh" fi @@ -170,9 +171,12 @@ case "$TARGET_OS" in PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -ldl" - if test $ROCKSDB_USE_IO_URING; then + if test -z "$ROCKSDB_USE_IO_URING"; then + ROCKSDB_USE_IO_URING=1 + fi + if test "$ROCKSDB_USE_IO_URING" -ne 0; then # check for liburing - $CXX $CFLAGS -x c++ - -luring -o /dev/null 2>/dev/null </dev/null < int main() { struct io_uring ring; @@ -283,7 +287,7 @@ if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then else if ! test $ROCKSDB_DISABLE_FALLOCATE; then # Test whether fallocate is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -299,7 +303,7 @@ EOF if ! test $ROCKSDB_DISABLE_SNAPPY; then # Test whether Snappy library is installed # http://code.google.com/p/snappy/ - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -314,7 +318,7 @@ EOF # Test whether gflags library is installed # http://gflags.github.io/gflags/ # check if the namespace is gflags - if $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + if $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF #include using namespace GFLAGS_NAMESPACE; int main() {} @@ -323,7 +327,7 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" # check if namespace is gflags - elif $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF #include using namespace gflags; int main() {} @@ -332,7 +336,7 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" # check if namespace is google - elif $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF #include using namespace google; int main() {} @@ -345,7 +349,7 @@ EOF if ! test $ROCKSDB_DISABLE_ZLIB; then # Test whether zlib library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -358,7 +362,7 @@ EOF if ! test $ROCKSDB_DISABLE_BZIP; then # Test whether bzip library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -371,7 +375,7 @@ EOF if ! test $ROCKSDB_DISABLE_LZ4; then # Test whether lz4 library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() {} @@ -385,7 +389,7 @@ EOF if ! test $ROCKSDB_DISABLE_ZSTD; then # Test whether zstd library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -398,7 +402,7 @@ EOF if ! 
test $ROCKSDB_DISABLE_NUMA; then # Test whether numa is available - $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null </dev/null < #include int main() {} @@ -412,7 +416,7 @@ EOF if ! test $ROCKSDB_DISABLE_TBB; then # Test whether tbb is available - $CXX $CFLAGS $LDFLAGS -x c++ - -o /dev/null -ltbb 2>/dev/null </dev/null < int main() {} EOF @@ -425,7 +429,7 @@ EOF if ! test $ROCKSDB_DISABLE_JEMALLOC; then # Test whether jemalloc is available - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null -ljemalloc \ 2>/dev/null; then # This will enable some preprocessor identifiers in the Makefile JEMALLOC=1 @@ -446,7 +450,7 @@ EOF fi if ! test $JEMALLOC && ! test $ROCKSDB_DISABLE_TCMALLOC; then # jemalloc is not available. Let's try tcmalloc - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null \ -ltcmalloc 2>/dev/null; then PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc" JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc" @@ -455,7 +459,7 @@ EOF if ! test $ROCKSDB_DISABLE_MALLOC_USABLE_SIZE; then # Test whether malloc_usable_size is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { size_t res = malloc_usable_size(0); @@ -470,7 +474,7 @@ EOF if ! test $ROCKSDB_DISABLE_MEMKIND; then # Test whether memkind library is installed - $CXX $CFLAGS $COMMON_FLAGS -lmemkind -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { memkind_malloc(MEMKIND_DAX_KMEM, 1024); @@ -486,7 +490,7 @@ EOF if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then # Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { int x = PTHREAD_MUTEX_ADAPTIVE_NP; @@ -501,7 +505,7 @@ EOF if ! test $ROCKSDB_DISABLE_BACKTRACE; then # Test whether backtrace is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { void* frames[1]; @@ -513,7 +517,7 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" else # Test whether execinfo library is installed - $CXX $CFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { void* frames[1]; @@ -530,7 +534,7 @@ EOF if ! test $ROCKSDB_DISABLE_PG; then # Test if -pg is supported - $CXX $CFLAGS -pg -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null </dev/null < int main() { int fd = open("/dev/null", 0); @@ -556,7 +560,7 @@ EOF if ! test $ROCKSDB_DISABLE_SCHED_GETCPU; then # Test whether sched_getcpu is supported - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { int cpuid = sched_getcpu(); @@ -570,7 +574,7 @@ EOF if ! test $ROCKSDB_DISABLE_AUXV_GETAUXVAL; then # Test whether getauxval is supported - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { uint64_t auxv = getauxval(AT_HWCAP); @@ -598,7 +602,7 @@ fi # -Wshorten-64-to-32 breaks compilation on FreeBSD i386 if ! [ "$TARGET_OS" = FreeBSD -a "$TARGET_ARCHITECTURE" = i386 ]; then # Test whether -Wshorten-64-to-32 is available - $CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null </dev/null </dev/null < + #include + #include + + int main(int argc, char *argv[]) { + printf("GNU libc version: %s\n", gnu_get_libc_version()); + return 0; + } +EOF + if [ "$?" 
!= 0 ]; then + PPC_LIBC_IS_GNU=0 fi fi @@ -835,8 +861,12 @@ echo "CXX=$CXX" >> "$OUTPUT" echo "AR=$AR" >> "$OUTPUT" echo "PLATFORM=$PLATFORM" >> "$OUTPUT" echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT" +echo "PLATFORM_CMAKE_FLAGS=$PLATFORM_CMAKE_FLAGS" >> "$OUTPUT" echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT" echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_CCFLAGS=$JAVA_STATIC_DEPS_CCFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_CXXFLAGS=$JAVA_STATIC_DEPS_CXXFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_LDFLAGS=$JAVA_STATIC_DEPS_LDFLAGS" >> "$OUTPUT" echo "JAVAC_ARGS=$JAVAC_ARGS" >> "$OUTPUT" echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT" echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT" @@ -870,3 +900,6 @@ echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" fi +if test -n "$PPC_LIBC_IS_GNU"; then + echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT" +fi diff --git a/build_tools/fbcode_config_platform009.sh b/build_tools/fbcode_config_platform009.sh index cf24bd81599..82c85b09c1d 100644 --- a/build_tools/fbcode_config_platform009.sh +++ b/build_tools/fbcode_config_platform009.sh @@ -120,7 +120,7 @@ if [ -z "$USE_CLANG" ]; then CXX="$GCC_BASE/bin/g++" AR="$GCC_BASE/bin/gcc-ar" - CFLAGS+=" -B$BINUTILS/gold" + CFLAGS+=" -B$BINUTILS" CFLAGS+=" -isystem $LIBGCC_INCLUDE" CFLAGS+=" -isystem $GLIBC_INCLUDE" JEMALLOC=1 @@ -133,7 +133,7 @@ else KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include" - CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib" + CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib" CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x " CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x/x86_64-facebook-linux " CFLAGS+=" -isystem $GLIBC_INCLUDE" @@ -150,10 +150,11 @@ CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PR CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS" -EXEC_LDFLAGS+=" -B$BINUTILS/gold" +EXEC_LDFLAGS+=" -B$BINUTILS" EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform009/lib/ld.so" EXEC_LDFLAGS+=" $LIBUNWIND" EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform009/lib" +EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64" # required by libtbb EXEC_LDFLAGS+=" -ldl" diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index 386885b578e..c2842dfa493 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -52,15 +52,16 @@ else else echo "You didn't have clang-format-diff.py and/or clang-format available in your computer!" echo "You can download clang-format-diff.py by running: " - echo " curl --location http://goo.gl/iUW1u2 -o ${CLANG_FORMAT_DIFF}" + echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py" + echo "You should make sure the downloaded script is not compromised." echo "You can download clang-format by running:" echo " brew install clang-format" echo " Or" echo " apt install clang-format" echo " This might work too:" echo " yum install git-clang-format" - echo "Then, move both files (i.e. ${CLANG_FORMAT_DIFF} and clang-format) to some directory within PATH=${PATH}" - echo "and make sure ${CLANG_FORMAT_DIFF} is executable." 
+ echo "Then make sure clang-format is available and executable from \$PATH:" + echo " clang-format --version" exit 128 fi # Check argparse pre-req on interpreter, or it will fail @@ -75,17 +76,16 @@ else exit 129 fi # Unfortunately, some machines have a Python2 clang-format-diff.py - # installed but only a Python3 interpreter installed. Rather than trying - # different Python versions that might be installed, we can try migrating - # the code to Python3 if it looks like Python2 + # installed but only a Python3 interpreter installed. Unfortunately, + # automatic 2to3 migration is insufficient, so suggest downloading latest. if grep -q "print '" "$CFD_PATH" && \ ${PYTHON:-python3} --version | grep -q 'ython 3'; then - if [ ! -f "$REPO_ROOT/.py3/clang-format-diff.py" ]; then - echo "Migrating $CFD_PATH to Python3 in a hidden file" - mkdir -p "$REPO_ROOT/.py3" - ${PYTHON:-python3} -m lib2to3 -w -n -o "$REPO_ROOT/.py3" "$CFD_PATH" > /dev/null || exit 128 - fi - CFD_PATH="$REPO_ROOT/.py3/clang-format-diff.py" + echo "You have clang-format-diff.py for Python 2 but are using a Python 3" + echo "interpreter (${PYTHON:-python3})." + echo "You can download clang-format-diff.py for Python 3 by running: " + echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py" + echo "You should make sure the downloaded script is not compromised." + exit 130 fi CLANG_FORMAT_DIFF="${PYTHON:-python3} $CFD_PATH" # This had better work after all those checks @@ -136,9 +136,11 @@ then FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)" # Get the differences diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) + echo "Checking format of changes not yet in $FORMAT_UPSTREAM..." else # Check the format of uncommitted lines, diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) + echo "Checking format of uncommitted changes..." fi if [ -z "$diffs" ] diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index f5ab73a7f6a..42ce511f0a6 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -3,7 +3,7 @@ # to determine next steps to run # Usage: -# EMAIL= ONCALL= TRIGGER= SUBSCRIBER= rocks_ci.py +# EMAIL= ONCALL= TRIGGER= SUBSCRIBER= WORKINGDIR= rocksdb-lego-determinator # # Input Value # ------------------------------------------------------------------------- @@ -11,7 +11,7 @@ # ONCALL Email address to raise a task on failure # TRIGGER Trigger conditions for email. Valid values are fail, warn, all # SUBSCRIBER Email addresss to add as subscriber for task -# +# WORKINGDIR Working directory # # Report configuration @@ -53,13 +53,19 @@ if [[ ! -z $REPORT_EMAIL || ! -z $CREATE_TASK ]]; then ]" fi +# Working directory for the following command, default to current directory +WORKING_DIR=. +if [ ! -z $WORKINGDIR ]; then + WORKING_DIR=$WORKINGDIR +fi + # # Helper variables # CLEANUP_ENV=" { 'name':'Cleanup environment', - 'shell':'rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && (chmod +t /dev/shm || true) && make clean', + 'shell':'cd $WORKING_DIR; rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && (chmod +t /dev/shm || true) && make clean', 'user':'root' }" @@ -100,9 +106,7 @@ NON_SHM="TMPD=/tmp/rocksdb_test_tmp" GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1" ASAN="COMPILE_WITH_ASAN=1" CLANG="USE_CLANG=1" -# in gcc-5 there are known problems with TSAN like https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71090. 
-# using platform007 gives us gcc-8 or higher which has that bug fixed. -TSAN="ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1" +TSAN="COMPILE_WITH_TSAN=1" UBSAN="COMPILE_WITH_UBSAN=1" TSAN_CRASH='CRASH_TEST_EXT_ARGS="--compression_type=zstd --log2_keys_per_lock=22"' NON_TSAN_CRASH="CRASH_TEST_EXT_ARGS=--compression_type=zstd" @@ -112,6 +116,7 @@ SETUP_JAVA_ENV="export $HTTP_PROXY; export JAVA_HOME=/usr/local/jdk-8u60-64/; ex PARSER="'parser':'python build_tools/error_filter.py $1'" CONTRUN_NAME="ROCKSDB_CONTRUN_NAME" +SKIP_FORMAT_CHECKS="SKIP_FORMAT_BUCK_CHECKS=1" # This code is getting called under various scenarios. What we care about is to # understand when it's called from nightly contruns because in that case we'll @@ -153,7 +158,7 @@ UNIT_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -176,7 +181,7 @@ UNIT_TEST_NON_SHM_COMMANDS="[ { 'name':'Build and test RocksDB debug version', 'timeout': 86400, - 'shell':'$NON_SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $NON_SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -197,7 +202,7 @@ RELEASE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB release', - 'shell':'make $PARALLEL_j release || $CONTRUN_NAME=release $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; make $PARALLEL_j release || $CONTRUN_NAME=release $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -218,7 +223,7 @@ UNIT_TEST_COMMANDS_481="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $GCC_481 $DEBUG make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $GCC_481 $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -239,7 +244,7 @@ RELEASE_BUILD_COMMANDS_481="[ $CLEANUP_ENV, { 'name':'Build RocksDB release on GCC 4.8.1', - 'shell':'$GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -260,7 +265,7 @@ CLANG_UNIT_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug', - 'shell':'$CLANG $SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $CLANG $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -281,7 +286,7 @@ CLANG_RELEASE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB release', - 'shell':'$CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -302,7 +307,7 @@ CLANG_ANALYZE_COMMANDS="[ $CLEANUP_ENV, { 'name':'RocksDB build and analyze', - 'shell':'$CLANG $SHM $DEBUG make $PARALLEL_j analyze || $CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $CLANG $SHM $DEBUG make $PARALLEL_j analyze || 
$CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -323,7 +328,7 @@ CODE_COV_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build, test and collect code coverage info', - 'shell':'$SHM $DEBUG make $PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -344,7 +349,7 @@ UNITY_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build, test unity test', - 'shell':'$SHM $DEBUG V=1 make J=1 unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG V=1 make J=1 unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -365,7 +370,7 @@ LITE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB debug version', - 'shell':'make J=1 LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SKIP_FORMAT_CHECKS make J=1 LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -385,7 +390,7 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ $CLEANUP_ENV, { 'name':'Report RocksDB Lite binary size to scuba', - 'shell':'tools/report_lite_binary_size.sh', + 'shell':'cd $WORKING_DIR; tools/report_lite_binary_size.sh', 'user':'root', }, ], @@ -404,14 +409,14 @@ STRESS_CRASH_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -434,14 +439,14 @@ BLACKBOX_STRESS_CRASH_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug blackbox crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 blackbox_crash_test || $CONTRUN_NAME=blackbox_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 blackbox_crash_test || $CONTRUN_NAME=blackbox_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -464,14 +469,14 @@ WHITEBOX_STRESS_CRASH_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug whitebox crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 whitebox_crash_test || $CONTRUN_NAME=whitebox_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 whitebox_crash_test || $CONTRUN_NAME=whitebox_crash_test $TASK_CREATION_TOOL', 
'user':'root', $PARSER }, @@ -494,14 +499,14 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests with atomic flush', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -524,14 +529,44 @@ STRESS_CRASH_TEST_WITH_TXN_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests with txn', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_txn || $CONTRUN_NAME=crash_test_with_txn $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_txn || $CONTRUN_NAME=crash_test_with_txn $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + $UPLOAD_DB_DIR, + ], + $REPORT + } +]" + +# +# RocksDB stress/crash test with timestamp +# +STRESS_CRASH_TEST_WITH_TS_COMMANDS="[ + { + 'name':'Rocksdb Stress and Crash Test with ts', + 'oncall':'$ONCALL', + 'executeLocal': 'true', + 'timeout': 86400, + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build and run RocksDB debug stress tests', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + { + 'name':'Build and run RocksDB debug crash tests with ts', + 'timeout': 86400, + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_ts || $CONTRUN_NAME=crash_test_with_ts $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -553,7 +588,7 @@ WRITE_STRESS_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB write stress tests', - 'shell':'make write_stress && python tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; make write_stress && python tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -576,7 +611,7 @@ ASAN_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Test RocksDB debug under ASAN', -'shell':'set -o pipefail && ($SHM $ASAN $DEBUG make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL) |& /usr/facebook/ops/scripts/asan_symbolize.py -d', +'shell':'cd $WORKING_DIR; set -o pipefail && ($SHM $ASAN $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL) |& /usr/facebook/ops/scripts/asan_symbolize.py -d', 'user':'root', $PARSER } @@ -599,7 +634,7 @@ ASAN_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test || 
$CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -623,7 +658,7 @@ ASAN_BLACKBOX_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug blackbox asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 blackbox_asan_crash_test || $CONTRUN_NAME=blackbox_asan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 blackbox_asan_crash_test || $CONTRUN_NAME=blackbox_asan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -647,7 +682,7 @@ ASAN_WHITEBOX_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug whitebox asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 whitebox_asan_crash_test || $CONTRUN_NAME=whitebox_asan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 whitebox_asan_crash_test || $CONTRUN_NAME=whitebox_asan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -671,7 +706,7 @@ ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Build and run RocksDB debug asan_crash_test_with_atomic_flush', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -695,7 +730,7 @@ ASAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { 'name':'Build and run RocksDB debug asan_crash_test_with_txn', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_txn || $CONTRUN_NAME=asan_crash_test_with_txn $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 asan_crash_test_with_txn || $CONTRUN_NAME=asan_crash_test_with_txn $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -717,7 +752,7 @@ UBSAN_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Test RocksDB debug under UBSAN', - 'shell':'set -o pipefail && $SHM $UBSAN $CLANG $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $UBSAN $CLANG $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -740,7 +775,7 @@ UBSAN_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -764,7 +799,7 @@ UBSAN_BLACKBOX_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug blackbox ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 blackbox_ubsan_crash_test || $CONTRUN_NAME=blackbox_ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 blackbox_ubsan_crash_test || $CONTRUN_NAME=blackbox_ubsan_crash_test $TASK_CREATION_TOOL', 
'user':'root', $PARSER }, @@ -788,7 +823,7 @@ UBSAN_WHITEBOX_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug whitebox ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 whitebox_ubsan_crash_test || $CONTRUN_NAME=whitebox_ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 whitebox_ubsan_crash_test || $CONTRUN_NAME=whitebox_ubsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -812,7 +847,7 @@ UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Build and run RocksDB debug ubsan_crash_test_with_atomic_flush', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -836,7 +871,7 @@ UBSAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { 'name':'Build and run RocksDB debug ubsan_crash_test_with_txn', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test_with_txn || $CONTRUN_NAME=ubsan_crash_test_with_txn $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 ubsan_crash_test_with_txn || $CONTRUN_NAME=ubsan_crash_test_with_txn $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -860,7 +895,7 @@ VALGRIND_TEST_COMMANDS="[ { 'name':'Run RocksDB debug unit tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG make $PARALLELISM valgrind_test || $CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG make $PARALLELISM valgrind_test || $CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -883,7 +918,7 @@ TSAN_UNIT_TEST_COMMANDS="[ { 'name':'Run RocksDB debug unit test', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -906,7 +941,7 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -930,7 +965,7 @@ TSAN_BLACKBOX_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 blackbox_crash_test || $CONTRUN_NAME=tsan_blackbox_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 blackbox_crash_test || $CONTRUN_NAME=tsan_blackbox_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -954,7 +989,7 @@ TSAN_WHITEBOX_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 whitebox_crash_test || 
$CONTRUN_NAME=tsan_whitebox_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 whitebox_crash_test || $CONTRUN_NAME=tsan_whitebox_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -978,7 +1013,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1002,7 +1037,7 @@ TSAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_txn || $CONTRUN_NAME=tsan_crash_test_with_txn $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_txn || $CONTRUN_NAME=tsan_crash_test_with_txn $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1036,7 +1071,7 @@ FORMAT_COMPATIBLE_COMMANDS="[ $CLEANUP_ENV, { 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1058,6 +1093,7 @@ run_no_compression() mv .tmp.fbcode_config.sh build_tools/fbcode_config.sh cat Makefile | grep -v tools/ldb_test.py > .tmp.Makefile mv .tmp.Makefile Makefile + export $SKIP_FORMAT_CHECKS make $DEBUG J=1 check } @@ -1070,7 +1106,7 @@ NO_COMPRESSION_COMMANDS="[ $CLEANUP_ENV, { 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1129,7 +1165,7 @@ REGRESSION_COMMANDS="[ $CLEANUP_ENV, { 'name':'Make and run script', - 'shell':'build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1150,7 +1186,7 @@ JAVA_BUILD_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB for Java', - 'shell':'$SETUP_JAVA_ENV; $SHM make rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SETUP_JAVA_ENV; $SHM make rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1212,6 +1248,9 @@ case $1 in stress_crash_with_txn) echo $STRESS_CRASH_TEST_WITH_TXN_COMMANDS ;; + stress_crash_with_ts) + echo $STRESS_CRASH_TEST_WITH_TS_COMMANDS + ;; write_stress) echo $WRITE_STRESS_COMMANDS ;; diff --git a/build_tools/run_ci_db_test.ps1 b/build_tools/run_ci_db_test.ps1 index 883d4e2a5c6..9aea51708cb 100644 --- a/build_tools/run_ci_db_test.ps1 +++ b/build_tools/run_ci_db_test.ps1 @@ -68,7 +68,7 @@ $BinariesFolder = -Join($RootFolder, 
"\build\Debug\") if($WorkFolder -eq "") { - # If TEST_TMPDIR is set use it + # If TEST_TMPDIR is set use it [string]$var = $Env:TEST_TMPDIR if($var -eq "") { $WorkFolder = -Join($RootFolder, "\db_tests\") @@ -93,7 +93,7 @@ $ExcludeCasesSet = New-Object System.Collections.Generic.HashSet[string] if($ExcludeCases -ne "") { Write-Host "ExcludeCases: $ExcludeCases" $l = $ExcludeCases -split ' ' - ForEach($t in $l) { + ForEach($t in $l) { $ExcludeCasesSet.Add($t) | Out-Null } } @@ -102,7 +102,7 @@ $ExcludeExesSet = New-Object System.Collections.Generic.HashSet[string] if($ExcludeExes -ne "") { Write-Host "ExcludeExe: $ExcludeExes" $l = $ExcludeExes -split ' ' - ForEach($t in $l) { + ForEach($t in $l) { $ExcludeExesSet.Add($t) | Out-Null } } @@ -118,6 +118,10 @@ if($ExcludeExes -ne "") { # MultiThreaded/MultiThreadedDBTest. # MultiThreaded/0 # GetParam() = 0 # MultiThreaded/1 # GetParam() = 1 +# RibbonTypeParamTest/0. # TypeParam = struct DefaultTypesAndSettings +# CompactnessAndBacktrackAndFpRate +# Extremes +# FindOccupancyForSuccessRate # # into this: # @@ -125,6 +129,9 @@ if($ExcludeExes -ne "") { # DBTest.WriteEmptyBatch # MultiThreaded/MultiThreadedDBTest.MultiThreaded/0 # MultiThreaded/MultiThreadedDBTest.MultiThreaded/1 +# RibbonTypeParamTest/0.CompactnessAndBacktrackAndFpRate +# RibbonTypeParamTest/0.Extremes +# RibbonTypeParamTest/0.FindOccupancyForSuccessRate # # Output into the parameter in a form TestName -> Log File Name function ExtractTestCases([string]$GTestExe, $HashTable) { @@ -138,6 +145,8 @@ function ExtractTestCases([string]$GTestExe, $HashTable) { ForEach( $l in $Tests) { + # remove trailing comment if any + $l = $l -replace '\s+\#.*','' # Leading whitespace is fine $l = $l -replace '^\s+','' # Trailing dot is a test group but no whitespace @@ -146,8 +155,7 @@ function ExtractTestCases([string]$GTestExe, $HashTable) { } else { # Otherwise it is a test name, remove leading space $test = $l - # remove trailing comment if any and create a log name - $test = $test -replace '\s+\#.*','' + # create a log name $test = "$Group$test" if($ExcludeCasesSet.Contains($test)) { @@ -253,7 +261,7 @@ if($Run -ne "") { $DiscoveredExe = @() dir -Path $search_path | ForEach-Object { - $DiscoveredExe += ($_.Name) + $DiscoveredExe += ($_.Name) } # Remove exclusions @@ -293,7 +301,7 @@ if($SuiteRun -ne "") { $ListOfExe = @() dir -Path $search_path | ForEach-Object { - $ListOfExe += ($_.Name) + $ListOfExe += ($_.Name) } # Exclude those in RunOnly from running as suites @@ -348,7 +356,7 @@ function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) # Wait for all to finish and get the results while(($JobToLog.Count -gt 0) -or - ($TestCmds.Count -gt 0) -or + ($TestCmds.Count -gt 0) -or ($Suites.Count -gt 0)) { # Make sure we have maximum concurrent jobs running if anything @@ -468,8 +476,8 @@ RunJobs -Suites $CasesToRun -TestCmds $TestExes -ConcurrencyVal $Concurrency $EndDate = (Get-Date) -New-TimeSpan -Start $StartDate -End $EndDate | - ForEach-Object { +New-TimeSpan -Start $StartDate -End $EndDate | + ForEach-Object { "Elapsed time: {0:g}" -f $_ } @@ -484,4 +492,4 @@ if(!$script:success) { exit 0 - + diff --git a/cache/cache.cc b/cache/cache.cc index 78897c41614..4eef1c2d61e 100644 --- a/cache/cache.cc +++ b/cache/cache.cc @@ -44,9 +44,9 @@ Status Cache::CreateFromString(const ConfigOptions& config_options, } else { #ifndef ROCKSDB_LITE LRUCacheOptions cache_opts; - status = OptionTypeInfo::ParseStruct( - config_options, "", &lru_cache_options_type_info, "", value, - 
reinterpret_cast(&cache_opts)); + status = OptionTypeInfo::ParseStruct(config_options, "", + &lru_cache_options_type_info, "", + value, &cache_opts); if (status.ok()) { cache = NewLRUCache(cache_opts); } diff --git a/cache/cache_bench.cc b/cache/cache_bench.cc index 7d246759694..0669354ae19 100644 --- a/cache/cache_bench.cc +++ b/cache/cache_bench.cc @@ -1,8 +1,11 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). - +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. #ifndef GFLAGS #include int main() { @@ -10,372 +13,8 @@ int main() { return 1; } #else - -#include -#include -#include -#include - -#include "port/port.h" -#include "rocksdb/cache.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "util/coding.h" -#include "util/gflags_compat.h" -#include "util/hash.h" -#include "util/mutexlock.h" -#include "util/random.h" - -using GFLAGS_NAMESPACE::ParseCommandLineFlags; - -static constexpr uint32_t KiB = uint32_t{1} << 10; -static constexpr uint32_t MiB = KiB << 10; -static constexpr uint64_t GiB = MiB << 10; - -DEFINE_uint32(threads, 16, "Number of concurrent threads to run."); -DEFINE_uint64(cache_size, 1 * GiB, - "Number of bytes to use as a cache of uncompressed data."); -DEFINE_uint32(num_shard_bits, 6, "shard_bits."); - -DEFINE_double(resident_ratio, 0.25, - "Ratio of keys fitting in cache to keyspace."); -DEFINE_uint64(ops_per_thread, 0, - "Number of operations per thread. (Default: 5 * keyspace size)"); -DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); - -DEFINE_uint32(skew, 5, "Degree of skew in key selection"); -DEFINE_bool(populate_cache, true, "Populate cache before operations"); - -DEFINE_uint32(lookup_insert_percent, 87, - "Ratio of lookup (+ insert on not found) to total workload " - "(expressed as a percentage)"); -DEFINE_uint32(insert_percent, 2, - "Ratio of insert to total workload (expressed as a percentage)"); -DEFINE_uint32(lookup_percent, 10, - "Ratio of lookup to total workload (expressed as a percentage)"); -DEFINE_uint32(erase_percent, 1, - "Ratio of erase to total workload (expressed as a percentage)"); - -DEFINE_bool(use_clock_cache, false, ""); - -namespace ROCKSDB_NAMESPACE { - -class CacheBench; -namespace { -// State shared by all concurrent executions of the same benchmark. 
-class SharedState { - public: - explicit SharedState(CacheBench* cache_bench) - : cv_(&mu_), - num_initialized_(0), - start_(false), - num_done_(0), - cache_bench_(cache_bench) {} - - ~SharedState() {} - - port::Mutex* GetMutex() { - return &mu_; - } - - port::CondVar* GetCondVar() { - return &cv_; - } - - CacheBench* GetCacheBench() const { - return cache_bench_; - } - - void IncInitialized() { - num_initialized_++; - } - - void IncDone() { - num_done_++; - } - - bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; } - - bool AllDone() const { return num_done_ >= FLAGS_threads; } - - void SetStart() { - start_ = true; - } - - bool Started() const { - return start_; - } - - private: - port::Mutex mu_; - port::CondVar cv_; - - uint64_t num_initialized_; - bool start_; - uint64_t num_done_; - - CacheBench* cache_bench_; -}; - -// Per-thread state for concurrent executions of the same benchmark. -struct ThreadState { - uint32_t tid; - Random64 rnd; - SharedState* shared; - - ThreadState(uint32_t index, SharedState* _shared) - : tid(index), rnd(1000 + index), shared(_shared) {} -}; - -struct KeyGen { - char key_data[27]; - - Slice GetRand(Random64& rnd, uint64_t max_key) { - uint64_t raw = rnd.Next(); - // Skew according to setting - for (uint32_t i = 0; i < FLAGS_skew; ++i) { - raw = std::min(raw, rnd.Next()); - } - uint64_t key = FastRange64(raw, max_key); - // Variable size and alignment - size_t off = key % 8; - key_data[0] = char{42}; - EncodeFixed64(key_data + 1, key); - key_data[9] = char{11}; - EncodeFixed64(key_data + 10, key); - key_data[18] = char{4}; - EncodeFixed64(key_data + 19, key); - return Slice(&key_data[off], sizeof(key_data) - off); - } -}; - -char* createValue(Random64& rnd) { - char* rv = new char[FLAGS_value_bytes]; - // Fill with some filler data, and take some CPU time - for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { - EncodeFixed64(rv + i, rnd.Next()); - } - return rv; -} - -void deleter(const Slice& /*key*/, void* value) { - delete[] static_cast(value); -} -} // namespace - -class CacheBench { - static constexpr uint64_t kHundredthUint64 = - std::numeric_limits::max() / 100U; - - public: - CacheBench() - : max_key_(static_cast(FLAGS_cache_size / FLAGS_resident_ratio / - FLAGS_value_bytes)), - lookup_insert_threshold_(kHundredthUint64 * - FLAGS_lookup_insert_percent), - insert_threshold_(lookup_insert_threshold_ + - kHundredthUint64 * FLAGS_insert_percent), - lookup_threshold_(insert_threshold_ + - kHundredthUint64 * FLAGS_lookup_percent), - erase_threshold_(lookup_threshold_ + - kHundredthUint64 * FLAGS_erase_percent) { - if (erase_threshold_ != 100U * kHundredthUint64) { - fprintf(stderr, "Percentages must add to 100.\n"); - exit(1); - } - if (FLAGS_use_clock_cache) { - cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); - if (!cache_) { - fprintf(stderr, "Clock cache not supported.\n"); - exit(1); - } - } else { - cache_ = NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits); - } - if (FLAGS_ops_per_thread == 0) { - FLAGS_ops_per_thread = 5 * max_key_; - } - } - - ~CacheBench() {} - - void PopulateCache() { - Random64 rnd(1); - KeyGen keygen; - for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { - cache_->Insert(keygen.GetRand(rnd, max_key_), createValue(rnd), - FLAGS_value_bytes, &deleter); - } - } - - bool Run() { - ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default(); - - PrintEnv(); - SharedState shared(this); - std::vector > threads(FLAGS_threads); - for (uint32_t i = 0; i < FLAGS_threads; 
i++) { - threads[i].reset(new ThreadState(i, &shared)); - env->StartThread(ThreadBody, threads[i].get()); - } - { - MutexLock l(shared.GetMutex()); - while (!shared.AllInitialized()) { - shared.GetCondVar()->Wait(); - } - // Record start time - uint64_t start_time = env->NowMicros(); - - // Start all threads - shared.SetStart(); - shared.GetCondVar()->SignalAll(); - - // Wait threads to complete - while (!shared.AllDone()) { - shared.GetCondVar()->Wait(); - } - - // Record end time - uint64_t end_time = env->NowMicros(); - double elapsed = static_cast(end_time - start_time) * 1e-6; - uint32_t qps = static_cast( - static_cast(FLAGS_threads * FLAGS_ops_per_thread) / elapsed); - fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps); - } - return true; - } - - private: - std::shared_ptr cache_; - const uint64_t max_key_; - // Cumulative thresholds in the space of a random uint64_t - const uint64_t lookup_insert_threshold_; - const uint64_t insert_threshold_; - const uint64_t lookup_threshold_; - const uint64_t erase_threshold_; - - static void ThreadBody(void* v) { - ThreadState* thread = static_cast(v); - SharedState* shared = thread->shared; - - { - MutexLock l(shared->GetMutex()); - shared->IncInitialized(); - if (shared->AllInitialized()) { - shared->GetCondVar()->SignalAll(); - } - while (!shared->Started()) { - shared->GetCondVar()->Wait(); - } - } - thread->shared->GetCacheBench()->OperateCache(thread); - - { - MutexLock l(shared->GetMutex()); - shared->IncDone(); - if (shared->AllDone()) { - shared->GetCondVar()->SignalAll(); - } - } - } - - void OperateCache(ThreadState* thread) { - // To use looked-up values - uint64_t result = 0; - // To hold handles for a non-trivial amount of time - Cache::Handle* handle = nullptr; - KeyGen gen; - for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { - Slice key = gen.GetRand(thread->rnd, max_key_); - uint64_t random_op = thread->rnd.Next(); - if (random_op < lookup_insert_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } - // do lookup - handle = cache_->Lookup(key); - if (handle) { - // do something with the data - result += NPHash64(static_cast(cache_->Value(handle)), - FLAGS_value_bytes); - } else { - // do insert - cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes, - &deleter, &handle); - } - } else if (random_op < insert_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } - // do insert - cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes, - &deleter, &handle); - } else if (random_op < lookup_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } - // do lookup - handle = cache_->Lookup(key); - if (handle) { - // do something with the data - result += NPHash64(static_cast(cache_->Value(handle)), - FLAGS_value_bytes); - } - } else if (random_op < erase_threshold_) { - // do erase - cache_->Erase(key); - } else { - // Should be extremely unlikely (noop) - assert(random_op >= kHundredthUint64 * 100U); - } - } - if (handle) { - cache_->Release(handle); - handle = nullptr; - } - } - - void PrintEnv() const { - printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); - printf("Number of threads : %u\n", FLAGS_threads); - printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); - printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); - printf("Num shard bits : %u\n", FLAGS_num_shard_bits); - printf("Max key : %" PRIu64 "\n", max_key_); - printf("Resident ratio : %g\n", FLAGS_resident_ratio); - printf("Skew 
degree : %u\n", FLAGS_skew); - printf("Populate cache : %d\n", int{FLAGS_populate_cache}); - printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent); - printf("Insert percentage : %u%%\n", FLAGS_insert_percent); - printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent); - printf("Erase percentage : %u%%\n", FLAGS_erase_percent); - printf("----------------------------\n"); - } -}; -} // namespace ROCKSDB_NAMESPACE - +#include int main(int argc, char** argv) { - ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_threads <= 0) { - fprintf(stderr, "threads number <= 0\n"); - exit(1); - } - - ROCKSDB_NAMESPACE::CacheBench bench; - if (FLAGS_populate_cache) { - bench.PopulateCache(); - printf("Population complete\n"); - printf("----------------------------\n"); - } - if (bench.Run()) { - return 0; - } else { - return 1; - } + return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv); } - #endif // GFLAGS diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc new file mode 100644 index 00000000000..9fadf85a90b --- /dev/null +++ b/cache/cache_bench_tool.cc @@ -0,0 +1,573 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS +#include +#include +#include +#include +#include + +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/cachable_entry.h" +#include "util/coding.h" +#include "util/gflags_compat.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +static constexpr uint32_t KiB = uint32_t{1} << 10; +static constexpr uint32_t MiB = KiB << 10; +static constexpr uint64_t GiB = MiB << 10; + +DEFINE_uint32(threads, 16, "Number of concurrent threads to run."); +DEFINE_uint64(cache_size, 1 * GiB, + "Number of bytes to use as a cache of uncompressed data."); +DEFINE_uint32(num_shard_bits, 6, "shard_bits."); + +DEFINE_double(resident_ratio, 0.25, + "Ratio of keys fitting in cache to keyspace."); +DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread."); +DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); + +DEFINE_uint32(skew, 5, "Degree of skew in key selection"); +DEFINE_bool(populate_cache, true, "Populate cache before operations"); + +DEFINE_uint32(lookup_insert_percent, 87, + "Ratio of lookup (+ insert on not found) to total workload " + "(expressed as a percentage)"); +DEFINE_uint32(insert_percent, 2, + "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_uint32(lookup_percent, 10, + "Ratio of lookup to total workload (expressed as a percentage)"); +DEFINE_uint32(erase_percent, 1, + "Ratio of erase to total workload (expressed as a percentage)"); +DEFINE_bool(gather_stats, false, + "Whether to periodically simulate gathering block cache stats, " + "using one more thread."); +DEFINE_uint32( + gather_stats_sleep_ms, 1000, + "How many milliseconds to sleep between each gathering of stats."); + +DEFINE_uint32(gather_stats_entries_per_lock, 256, + "For Cache::ApplyToAllEntries"); +DEFINE_bool(skewed, false, "If true, skew the 
key access distribution"); +#ifndef ROCKSDB_LITE +DEFINE_string(secondary_cache_uri, "", + "Full URI for creating a custom secondary cache object"); +static class std::shared_ptr secondary_cache; +#endif // ROCKSDB_LITE + +DEFINE_bool(use_clock_cache, false, ""); + +namespace ROCKSDB_NAMESPACE { + +class CacheBench; +namespace { +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + explicit SharedState(CacheBench* cache_bench) + : cv_(&mu_), + num_initialized_(0), + start_(false), + num_done_(0), + cache_bench_(cache_bench) {} + + ~SharedState() {} + + port::Mutex* GetMutex() { return &mu_; } + + port::CondVar* GetCondVar() { return &cv_; } + + CacheBench* GetCacheBench() const { return cache_bench_; } + + void IncInitialized() { num_initialized_++; } + + void IncDone() { num_done_++; } + + bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; } + + bool AllDone() const { return num_done_ >= FLAGS_threads; } + + void SetStart() { start_ = true; } + + bool Started() const { return start_; } + + private: + port::Mutex mu_; + port::CondVar cv_; + + uint64_t num_initialized_; + bool start_; + uint64_t num_done_; + + CacheBench* cache_bench_; +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + uint32_t tid; + Random64 rnd; + SharedState* shared; + HistogramImpl latency_ns_hist; + uint64_t duration_us = 0; + + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rnd(1000 + index), shared(_shared) {} +}; + +struct KeyGen { + char key_data[27]; + + Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) { + uint64_t key = 0; + if (!FLAGS_skewed) { + uint64_t raw = rnd.Next(); + // Skew according to setting + for (uint32_t i = 0; i < FLAGS_skew; ++i) { + raw = std::min(raw, rnd.Next()); + } + key = FastRange64(raw, max_key); + } else { + key = rnd.Skewed(max_log); + if (key > max_key) { + key -= max_key; + } + } + // Variable size and alignment + size_t off = key % 8; + key_data[0] = char{42}; + EncodeFixed64(key_data + 1, key); + key_data[9] = char{11}; + EncodeFixed64(key_data + 10, key); + key_data[18] = char{4}; + EncodeFixed64(key_data + 19, key); + return Slice(&key_data[off], sizeof(key_data) - off); + } +}; + +char* createValue(Random64& rnd) { + char* rv = new char[FLAGS_value_bytes]; + // Fill with some filler data, and take some CPU time + for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { + EncodeFixed64(rv + i, rnd.Next()); + } + return rv; +} + +// Callbacks for secondary cache +size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; } + +Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) { + memcpy(out, obj, size); + return Status::OK(); +} + +// Different deleters to simulate using deleter to gather +// stats on the code origin and kind of cache entries. 
+void deleter1(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} +void deleter2(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} +void deleter3(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} + +Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1); +Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2); +Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3); +} // namespace + +class CacheBench { + static constexpr uint64_t kHundredthUint64 = + std::numeric_limits::max() / 100U; + + public: + CacheBench() + : max_key_(static_cast(FLAGS_cache_size / FLAGS_resident_ratio / + FLAGS_value_bytes)), + lookup_insert_threshold_(kHundredthUint64 * + FLAGS_lookup_insert_percent), + insert_threshold_(lookup_insert_threshold_ + + kHundredthUint64 * FLAGS_insert_percent), + lookup_threshold_(insert_threshold_ + + kHundredthUint64 * FLAGS_lookup_percent), + erase_threshold_(lookup_threshold_ + + kHundredthUint64 * FLAGS_erase_percent), + skewed_(FLAGS_skewed) { + if (erase_threshold_ != 100U * kHundredthUint64) { + fprintf(stderr, "Percentages must add to 100.\n"); + exit(1); + } + + max_log_ = 0; + if (skewed_) { + uint64_t max_key = max_key_; + while (max_key >>= 1) max_log_++; + if (max_key > (1u << max_log_)) max_log_++; + } + + if (FLAGS_use_clock_cache) { + cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); + if (!cache_) { + fprintf(stderr, "Clock cache not supported.\n"); + exit(1); + } + } else { + LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false, 0.5); +#ifndef ROCKSDB_LITE + if (!FLAGS_secondary_cache_uri.empty()) { + Status s = + ObjectRegistry::NewInstance()->NewSharedObject( + FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf( + stderr, + "No secondary cache registered matching string: %s status=%s\n", + FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + opts.secondary_cache = secondary_cache; + } +#endif // ROCKSDB_LITE + + cache_ = NewLRUCache(opts); + } + } + + ~CacheBench() {} + + void PopulateCache() { + Random64 rnd(1); + KeyGen keygen; + for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { + cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_), createValue(rnd), + &helper1, FLAGS_value_bytes); + } + } + + bool Run() { + const auto clock = SystemClock::Default().get(); + + PrintEnv(); + SharedState shared(this); + std::vector > threads(FLAGS_threads); + for (uint32_t i = 0; i < FLAGS_threads; i++) { + threads[i].reset(new ThreadState(i, &shared)); + std::thread(ThreadBody, threads[i].get()).detach(); + } + + HistogramImpl stats_hist; + std::string stats_report; + std::thread stats_thread(StatsBody, &shared, &stats_hist, &stats_report); + + uint64_t start_time; + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + // Record start time + start_time = clock->NowMicros(); + + // Start all threads + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + + // Wait threads to complete + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + } + + // Stats gathering is considered background work. This time measurement + // is for foreground work, and not really ideal for that. See below. + uint64_t end_time = clock->NowMicros(); + stats_thread.join(); + + // Wall clock time - includes idle time if threads + // finish at different times (not ideal). 
+ double elapsed_secs = static_cast(end_time - start_time) * 1e-6; + uint32_t ops_per_sec = static_cast( + 1.0 * FLAGS_threads * FLAGS_ops_per_thread / elapsed_secs); + printf("Complete in %.3f s; Rough parallel ops/sec = %u\n", elapsed_secs, + ops_per_sec); + + // Total time in each thread (more accurate throughput measure) + elapsed_secs = 0; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + elapsed_secs += threads[i]->duration_us * 1e-6; + } + ops_per_sec = static_cast(1.0 * FLAGS_threads * + FLAGS_ops_per_thread / elapsed_secs); + printf("Thread ops/sec = %u\n", ops_per_sec); + + printf("\nOperation latency (ns):\n"); + HistogramImpl combined; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + combined.Merge(threads[i]->latency_ns_hist); + } + printf("%s", combined.ToString().c_str()); + + if (FLAGS_gather_stats) { + printf("\nGather stats latency (us):\n"); + printf("%s", stats_hist.ToString().c_str()); + } + + printf("\n%s", stats_report.c_str()); + + return true; + } + + private: + std::shared_ptr cache_; + const uint64_t max_key_; + // Cumulative thresholds in the space of a random uint64_t + const uint64_t lookup_insert_threshold_; + const uint64_t insert_threshold_; + const uint64_t lookup_threshold_; + const uint64_t erase_threshold_; + const bool skewed_; + int max_log_; + + // A benchmark version of gathering stats on an active block cache by + // iterating over it. The primary purpose is to measure the impact of + // gathering stats with ApplyToAllEntries on throughput- and + // latency-sensitive Cache users. Performance of stats gathering is + // also reported. The last set of gathered stats is also reported, for + // manual sanity checking for logical errors or other unexpected + // behavior of cache_bench or the underlying Cache. + static void StatsBody(SharedState* shared, HistogramImpl* stats_hist, + std::string* stats_report) { + if (!FLAGS_gather_stats) { + return; + } + const auto clock = SystemClock::Default().get(); + uint64_t total_key_size = 0; + uint64_t total_charge = 0; + uint64_t total_entry_count = 0; + std::set deleters; + StopWatchNano timer(clock); + + for (;;) { + uint64_t time; + time = clock->NowMicros(); + uint64_t deadline = time + uint64_t{FLAGS_gather_stats_sleep_ms} * 1000; + + { + MutexLock l(shared->GetMutex()); + for (;;) { + if (shared->AllDone()) { + std::ostringstream ostr; + ostr << "Most recent cache entry stats:\n" + << "Number of entries: " << total_entry_count << "\n" + << "Total charge: " << BytesToHumanString(total_charge) << "\n" + << "Average key size: " + << (1.0 * total_key_size / total_entry_count) << "\n" + << "Average charge: " + << BytesToHumanString(1.0 * total_charge / total_entry_count) + << "\n" + << "Unique deleters: " << deleters.size() << "\n"; + *stats_report = ostr.str(); + return; + } + if (clock->NowMicros() >= deadline) { + break; + } + uint64_t diff = deadline - std::min(clock->NowMicros(), deadline); + shared->GetCondVar()->TimedWait(diff + 1); + } + } + + // Now gather stats, outside of mutex + total_key_size = 0; + total_charge = 0; + total_entry_count = 0; + deleters.clear(); + auto fn = [&](const Slice& key, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + total_key_size += key.size(); + total_charge += charge; + ++total_entry_count; + // Something slightly more expensive as in (future) stats by category + deleters.insert(deleter); + }; + timer.Start(); + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; + 
shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); + stats_hist->Add(timer.ElapsedNanos() / 1000); + } + } + + static void ThreadBody(ThreadState* thread) { + SharedState* shared = thread->shared; + + { + MutexLock l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetCacheBench()->OperateCache(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + } + + void OperateCache(ThreadState* thread) { + // To use looked-up values + uint64_t result = 0; + // To hold handles for a non-trivial amount of time + Cache::Handle* handle = nullptr; + KeyGen gen; + const auto clock = SystemClock::Default().get(); + uint64_t start_time = clock->NowMicros(); + StopWatchNano timer(clock); + + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + timer.Start(); + Slice key = gen.GetRand(thread->rnd, max_key_, max_log_); + uint64_t random_op = thread->rnd.Next(); + Cache::CreateCallback create_cb = + [](void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + *out_obj = reinterpret_cast(new char[size]); + memcpy(*out_obj, buf, size); + *charge = size; + return Status::OK(); + }; + + if (random_op < lookup_insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, + true); + if (handle) { + // do something with the data + result += NPHash64(static_cast(cache_->Value(handle)), + FLAGS_value_bytes); + } else { + // do insert + cache_->Insert(key, createValue(thread->rnd), &helper2, + FLAGS_value_bytes, &handle); + } + } else if (random_op < insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do insert + cache_->Insert(key, createValue(thread->rnd), &helper3, + FLAGS_value_bytes, &handle); + } else if (random_op < lookup_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, + true); + if (handle) { + // do something with the data + result += NPHash64(static_cast(cache_->Value(handle)), + FLAGS_value_bytes); + } + } else if (random_op < erase_threshold_) { + // do erase + cache_->Erase(key); + } else { + // Should be extremely unlikely (noop) + assert(random_op >= kHundredthUint64 * 100U); + } + thread->latency_ns_hist.Add(timer.ElapsedNanos()); + } + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // Ensure computations on `result` are not optimized away. + if (result == 1) { + printf("You are extremely unlucky(2). 
Try again.\n"); + exit(1); + } + thread->duration_us = clock->NowMicros() - start_time; + } + + void PrintEnv() const { + printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Number of threads : %u\n", FLAGS_threads); + printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); + printf("Cache size : %s\n", + BytesToHumanString(FLAGS_cache_size).c_str()); + printf("Num shard bits : %u\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", max_key_); + printf("Resident ratio : %g\n", FLAGS_resident_ratio); + printf("Skew degree : %u\n", FLAGS_skew); + printf("Populate cache : %d\n", int{FLAGS_populate_cache}); + printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent); + printf("Insert percentage : %u%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %u%%\n", FLAGS_erase_percent); + std::ostringstream stats; + if (FLAGS_gather_stats) { + stats << "enabled (" << FLAGS_gather_stats_sleep_ms << "ms, " + << FLAGS_gather_stats_entries_per_lock << "/lock)"; + } else { + stats << "disabled"; + } + printf("Gather stats : %s\n", stats.str().c_str()); + printf("----------------------------\n"); + } +}; + +int cache_bench_tool(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_threads <= 0) { + fprintf(stderr, "threads number <= 0\n"); + exit(1); + } + + ROCKSDB_NAMESPACE::CacheBench bench; + if (FLAGS_populate_cache) { + bench.PopulateCache(); + printf("Population complete\n"); + printf("----------------------------\n"); + } + if (bench.Run()) { + return 0; + } else { + return 1; + } +} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/cache/cache_entry_roles.cc b/cache/cache_entry_roles.cc new file mode 100644 index 00000000000..dbc71206398 --- /dev/null +++ b/cache/cache_entry_roles.cc @@ -0,0 +1,66 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/cache_entry_roles.h" + +#include + +#include "port/lang.h" + +namespace ROCKSDB_NAMESPACE { + +std::array kCacheEntryRoleToCamelString{{ + "DataBlock", + "FilterBlock", + "FilterMetaBlock", + "DeprecatedFilterBlock", + "IndexBlock", + "OtherBlock", + "WriteBuffer", + "Misc", +}}; + +std::array kCacheEntryRoleToHyphenString{{ + "data-block", + "filter-block", + "filter-meta-block", + "deprecated-filter-block", + "index-block", + "other-block", + "write-buffer", + "misc", +}}; + +namespace { + +struct Registry { + std::mutex mutex; + std::unordered_map role_map; + void Register(Cache::DeleterFn fn, CacheEntryRole role) { + std::lock_guard lock(mutex); + role_map[fn] = role; + } + std::unordered_map Copy() { + std::lock_guard lock(mutex); + return role_map; + } +}; + +Registry& GetRegistry() { + STATIC_AVOID_DESTRUCTION(Registry, registry); + return registry; +} + +} // namespace + +void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) { + GetRegistry().Register(fn, role); +} + +std::unordered_map CopyCacheDeleterRoleMap() { + return GetRegistry().Copy(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_entry_roles.h b/cache/cache_entry_roles.h new file mode 100644 index 00000000000..22148e00c41 --- /dev/null +++ b/cache/cache_entry_roles.h @@ -0,0 +1,122 @@ +// Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Classifications of block cache entries, for reporting statistics
+enum class CacheEntryRole {
+  // Block-based table data block
+  kDataBlock,
+  // Block-based table filter block (full or partitioned)
+  kFilterBlock,
+  // Block-based table metadata block for partitioned filter
+  kFilterMetaBlock,
+  // Block-based table deprecated filter block (old "block-based" filter)
+  kDeprecatedFilterBlock,
+  // Block-based table index block
+  kIndexBlock,
+  // Other kinds of block-based table block
+  kOtherBlock,
+  // WriteBufferManager reservations to account for memtable usage
+  kWriteBuffer,
+  // Default bucket, for miscellaneous cache entries. Do not use for
+  // entries that could potentially add up to large usage.
+  kMisc,
+};
+constexpr uint32_t kNumCacheEntryRoles =
+    static_cast<uint32_t>(CacheEntryRole::kMisc) + 1;
+
+extern std::array<const char*, kNumCacheEntryRoles>
+    kCacheEntryRoleToCamelString;
+extern std::array<const char*, kNumCacheEntryRoles>
+    kCacheEntryRoleToHyphenString;
+
+// To associate cache entries with their role, we use a hack on the
+// existing Cache interface. Because the deleter of an entry can authenticate
+// the code origin of an entry, we can elaborate the choice of deleter to
+// also encode role information, without inferring false role information
+// from entries not choosing to encode a role.
+//
+// The rest of this file is for handling mappings between deleters and
+// roles.
+
+// To infer a role from a deleter, the deleter must be registered. This
+// can be done "manually" with this function. This function is thread-safe,
+// and the registration mappings go into private but static storage. (Note
+// that DeleterFn is a function pointer, not std::function. Registrations
+// should not be too many.)
+void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role);
+
+// Gets a copy of the registered deleter -> role mappings. This is the only
+// function for reading the mappings made with RegisterCacheDeleterRole.
+// Why only this interface for reading?
+// * This function has to be thread safe, which could incur substantial
+//   overhead. We should not pay this overhead for every deleter look-up.
+// * This is suitable for preparing for batch operations, like with
+//   CacheEntryStatsCollector.
+// * The number of mappings should be sufficiently small (dozens).
+std::unordered_map<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap();
+
+// ************************************************************** //
+// An automatic registration infrastructure. This enables code
+// to simply ask for a deleter associated with a particular type
+// and role, and registration is automatic. In a sense, this is
+// a small dependency injection infrastructure, because linking
+// in new deleter instantiations is essentially sufficient for
+// making stats collection (using CopyCacheDeleterRoleMap) aware
+// of them.
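To make the deleter-to-role mapping described above concrete, here is a minimal, hypothetical caller-side sketch (editorial, not part of the patch). It uses RegisterCacheDeleterRole/CopyCacheDeleterRoleMap declared here and GetCacheEntryDeleterForRole defined just below; `MyBlockContents`, `InsertWithRole`, and `RoleOf` are assumed placeholder names.

#include <string>

#include "cache/cache_entry_roles.h"
#include "rocksdb/cache.h"

using namespace ROCKSDB_NAMESPACE;  // for brevity in this sketch

struct MyBlockContents {  // hypothetical cached value type
  std::string data;
};

void InsertWithRole(Cache* cache, const Slice& key, MyBlockContents* value,
                    size_t charge) {
  // The deleter both frees the value and, by its identity, encodes the role.
  Cache::DeleterFn deleter =
      GetCacheEntryDeleterForRole<MyBlockContents,
                                  CacheEntryRole::kOtherBlock>();
  cache->Insert(key, value, charge, deleter).PermitUncheckedError();
}

CacheEntryRole RoleOf(Cache::DeleterFn deleter) {
  // Copy the registered mappings once, then look up by function pointer.
  auto role_map = CopyCacheDeleterRoleMap();
  auto it = role_map.find(deleter);
  return it != role_map.end() ? it->second : CacheEntryRole::kMisc;
}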
+
+namespace cache_entry_roles_detail {
+
+template <typename T, CacheEntryRole R>
+struct RegisteredDeleter {
+  RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); }
+
+  // These have global linkage to help ensure compiler optimizations do not
+  // break uniqueness for each <T, R>
+  static void Delete(const Slice& /* key */, void* value) {
+    delete static_cast<T*>(value);
+  }
+};
+
+template <CacheEntryRole R>
+struct RegisteredNoopDeleter {
+  RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); }
+
+  static void Delete(const Slice& /* key */, void* value) {
+    (void)value;
+    assert(value == nullptr);
+  }
+};
+
+} // namespace cache_entry_roles_detail
+
+// Get an automatically registered deleter for value type T and role R.
+// Based on C++ semantics, registration is invoked exactly once in a
+// thread-safe way on first call to this function, for each <T, R>.
+template <typename T, CacheEntryRole R>
+Cache::DeleterFn GetCacheEntryDeleterForRole() {
+  static cache_entry_roles_detail::RegisteredDeleter<T, R> reg;
+  return reg.Delete;
+}
+
+// Get an automatically registered no-op deleter (value should be nullptr)
+// and associated with role R. This is used for Cache "reservation" entries
+// such as for WriteBufferManager.
+template <CacheEntryRole R>
+Cache::DeleterFn GetNoopDeleterForRole() {
+  static cache_entry_roles_detail::RegisteredNoopDeleter<R> reg;
+  return reg.Delete;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h
new file mode 100644
index 00000000000..ecd387f8523
--- /dev/null
+++ b/cache/cache_entry_stats.h
@@ -0,0 +1,165 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "cache/cache_helpers.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding_lean.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A generic helper object for gathering stats about cache entries by
+// iterating over them with ApplyToAllEntries. This class essentially
+// solves the problem of slowing down a Cache with too many stats
+// collectors that could be sharing stat results, such as from multiple
+// column families or multiple DBs sharing a Cache. We employ a few
+// mitigations:
+// * Only one collector for a particular kind of Stats is alive
+//   for each Cache. This is guaranteed using the Cache itself to hold
+//   the collector.
+// * A mutex ensures only one thread is gathering stats for this
+//   collector.
+// * The most recent gathered stats are saved and simply copied to
+//   satisfy requests within a time window (default: 3 minutes) of
+//   completion of the most recent stat gathering.
+//
+// Template parameter Stats must be copyable and trivially constructable,
+// as well as...
+// concept Stats {
+//   // Notification before applying callback to all entries
+//   void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+//   // Get the callback to apply to all entries. `callback`
+//   // type must be compatible with Cache::ApplyToAllEntries
+//   callback GetEntryCallback();
+//   // Notification after applying callback to all entries
+//   void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+//   // Notification that a collection was skipped because of
+//   // sufficiently recent saved results.
+// void SkippedCollection(); +// } +template +class CacheEntryStatsCollector { + public: + // Gathers stats and saves results into `stats` + // + // Maximum allowed age for a "hit" on saved results is determined by the + // two interval parameters. Both set to 0 forces a re-scan. For example + // with min_interval_seconds=300 and min_interval_factor=100, if the last + // scan took 10s, we would only rescan ("miss") if the age in seconds of + // the saved results is > max(300, 100*10). + // Justification: scans can vary wildly in duration, e.g. from 0.02 sec + // to as much as 20 seconds, so we want to be able to cap the absolute + // and relative frequency of scans. + void GetStats(Stats *stats, int min_interval_seconds, + int min_interval_factor) { + // Waits for any pending reader or writer (collector) + std::lock_guard lock(mutex_); + + uint64_t max_age_micros = + static_cast(std::max(min_interval_seconds, 0)) * 1000000U; + + if (last_end_time_micros_ > last_start_time_micros_ && + min_interval_factor > 0) { + max_age_micros = std::max( + max_age_micros, min_interval_factor * (last_end_time_micros_ - + last_start_time_micros_)); + } + + uint64_t start_time_micros = clock_->NowMicros(); + if ((start_time_micros - last_end_time_micros_) > max_age_micros) { + last_start_time_micros_ = start_time_micros; + saved_stats_.BeginCollection(cache_, clock_, start_time_micros); + + cache_->ApplyToAllEntries(saved_stats_.GetEntryCallback(), {}); + TEST_SYNC_POINT_CALLBACK( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr); + + uint64_t end_time_micros = clock_->NowMicros(); + last_end_time_micros_ = end_time_micros; + saved_stats_.EndCollection(cache_, clock_, end_time_micros); + } else { + saved_stats_.SkippedCollection(); + } + // Copy to caller + *stats = saved_stats_; + } + + Cache *GetCache() const { return cache_; } + + // Gets or creates a shared instance of CacheEntryStatsCollector in the + // cache itself, and saves into `ptr`. This shared_ptr will hold the + // entry in cache until all refs are destroyed. + static Status GetShared(Cache *cache, SystemClock *clock, + std::shared_ptr *ptr) { + std::array cache_key_data{ + {// First 16 bytes == md5 of class name + 0x7eba5a8fb5437c90U, 0x8ca68c9b11655855U, + // Last 8 bytes based on a function pointer to make unique for each + // template instantiation + reinterpret_cast(&CacheEntryStatsCollector::GetShared)}}; + Slice cache_key = GetSlice(&cache_key_data); + + Cache::Handle *h = cache->Lookup(cache_key); + if (h == nullptr) { + // Not yet in cache, but Cache doesn't provide a built-in way to + // avoid racing insert. So we double-check under a shared mutex, + // inspired by TableCache. + STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex); + std::lock_guard lock(static_mutex); + + h = cache->Lookup(cache_key); + if (h == nullptr) { + auto new_ptr = new CacheEntryStatsCollector(cache, clock); + // TODO: non-zero charge causes some tests that count block cache + // usage to go flaky. Fix the problem somehow so we can use an + // accurate charge. + size_t charge = 0; + Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h, + Cache::Priority::HIGH); + if (!s.ok()) { + assert(h == nullptr); + return s; + } + } + } + // If we reach here, shared entry is in cache with handle `h`. + assert(cache->GetDeleter(h) == Deleter); + + // Build an aliasing shared_ptr that keeps `ptr` in cache while there + // are references. 
+ *ptr = MakeSharedCacheHandleGuard(cache, h); + return Status::OK(); + } + + private: + explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock) + : saved_stats_(), + last_start_time_micros_(0), + last_end_time_micros_(/*pessimistic*/ 10000000), + cache_(cache), + clock_(clock) {} + + static void Deleter(const Slice &, void *value) { + delete static_cast(value); + } + + std::mutex mutex_; + Stats saved_stats_; + uint64_t last_start_time_micros_; + uint64_t last_end_time_micros_; + Cache *const cache_; + SystemClock *const clock_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_helpers.h b/cache/cache_helpers.h new file mode 100644 index 00000000000..4b784939613 --- /dev/null +++ b/cache/cache_helpers.h @@ -0,0 +1,125 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Returns the cached value given a cache handle. +template +T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) { + assert(cache); + assert(handle); + + return static_cast(cache->Value(handle)); +} + +// Simple generic deleter for Cache (to be used with Cache::Insert). +template +void DeleteCacheEntry(const Slice& /* key */, void* value) { + delete static_cast(value); +} + +// Turns a T* into a Slice so it can be used as a key with Cache. +template +Slice GetSlice(const T* t) { + return Slice(reinterpret_cast(t), sizeof(T)); +} + +// Generic resource management object for cache handles that releases the handle +// when destroyed. Has unique ownership of the handle, so copying it is not +// allowed, while moving it transfers ownership. 
+template <typename T>
+class CacheHandleGuard {
+ public:
+  CacheHandleGuard() = default;
+
+  CacheHandleGuard(Cache* cache, Cache::Handle* handle)
+      : cache_(cache),
+        handle_(handle),
+        value_(GetFromCacheHandle<T>(cache, handle)) {
+    assert(cache_ && handle_ && value_);
+  }
+
+  CacheHandleGuard(const CacheHandleGuard&) = delete;
+  CacheHandleGuard& operator=(const CacheHandleGuard&) = delete;
+
+  CacheHandleGuard(CacheHandleGuard&& rhs) noexcept
+      : cache_(rhs.cache_), handle_(rhs.handle_), value_(rhs.value_) {
+    assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
+
+    rhs.ResetFields();
+  }
+
+  CacheHandleGuard& operator=(CacheHandleGuard&& rhs) noexcept {
+    if (this == &rhs) {
+      return *this;
+    }
+
+    ReleaseHandle();
+
+    cache_ = rhs.cache_;
+    handle_ = rhs.handle_;
+    value_ = rhs.value_;
+
+    assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
+
+    rhs.ResetFields();
+
+    return *this;
+  }
+
+  ~CacheHandleGuard() { ReleaseHandle(); }
+
+  bool IsEmpty() const { return !handle_; }
+
+  Cache* GetCache() const { return cache_; }
+  Cache::Handle* GetCacheHandle() const { return handle_; }
+  T* GetValue() const { return value_; }
+
+  void Reset() {
+    ReleaseHandle();
+    ResetFields();
+  }
+
+ private:
+  void ReleaseHandle() {
+    if (IsEmpty()) {
+      return;
+    }
+
+    assert(cache_);
+    cache_->Release(handle_);
+  }
+
+  void ResetFields() {
+    cache_ = nullptr;
+    handle_ = nullptr;
+    value_ = nullptr;
+  }
+
+ private:
+  Cache* cache_ = nullptr;
+  Cache::Handle* handle_ = nullptr;
+  T* value_ = nullptr;
+};
+
+// Build an aliasing shared_ptr that keeps `handle` in cache while there
+// are references, but the pointer is to the value for that cache entry,
+// which must be of type T. This is copyable, unlike CacheHandleGuard, but
+// does not provide access to caching details.
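As an editorial aside between the guard class above and the aliasing helper defined next, here is a minimal, hypothetical usage sketch of CacheHandleGuard (not part of the patch); `BlockLike` and `ReadPayloadIfCached` are assumed names.

#include "cache/cache_helpers.h"
#include "rocksdb/cache.h"

struct BlockLike {  // hypothetical cached value type
  int payload = 0;
};

// Returns the cached payload for `key`, or -1 if not present. The guard
// releases the cache handle when it goes out of scope, even on early return.
int ReadPayloadIfCached(ROCKSDB_NAMESPACE::Cache* cache,
                        const ROCKSDB_NAMESPACE::Slice& key) {
  ROCKSDB_NAMESPACE::Cache::Handle* handle = cache->Lookup(key);
  if (handle == nullptr) {
    return -1;
  }
  ROCKSDB_NAMESPACE::CacheHandleGuard<BlockLike> guard(cache, handle);
  return guard.GetValue()->payload;
}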
+template +std::shared_ptr MakeSharedCacheHandleGuard(Cache* cache, + Cache::Handle* handle) { + auto wrapper = std::make_shared>(cache, handle); + return std::shared_ptr(wrapper, static_cast(cache->Value(handle))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 40d8c42cc7c..66881b5c998 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -712,25 +712,98 @@ TEST_P(CacheTest, OverCapacity) { } namespace { -std::vector> callback_state; -void callback(void* entry, size_t charge) { - callback_state.push_back({DecodeValue(entry), static_cast(charge)}); +std::vector> legacy_callback_state; +void legacy_callback(void* value, size_t charge) { + legacy_callback_state.push_back( + {DecodeValue(value), static_cast(charge)}); } }; -TEST_P(CacheTest, ApplyToAllCacheEntiresTest) { +TEST_P(CacheTest, ApplyToAllCacheEntriesTest) { std::vector> inserted; - callback_state.clear(); + legacy_callback_state.clear(); for (int i = 0; i < 10; ++i) { Insert(i, i * 2, i + 1); inserted.push_back({i * 2, i + 1}); } - cache_->ApplyToAllCacheEntries(callback, true); + cache_->ApplyToAllCacheEntries(legacy_callback, true); + + std::sort(inserted.begin(), inserted.end()); + std::sort(legacy_callback_state.begin(), legacy_callback_state.end()); + ASSERT_EQ(inserted.size(), legacy_callback_state.size()); + for (size_t i = 0; i < inserted.size(); ++i) { + EXPECT_EQ(inserted[i], legacy_callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesTest) { + std::vector callback_state; + const auto callback = [&](const Slice& key, void* value, size_t charge, + Cache::DeleterFn deleter) { + callback_state.push_back(ToString(DecodeKey(key)) + "," + + ToString(DecodeValue(value)) + "," + + ToString(charge)); + assert(deleter == &CacheTest::Deleter); + }; + + std::vector inserted; + callback_state.clear(); + + for (int i = 0; i < 10; ++i) { + Insert(i, i * 2, i + 1); + inserted.push_back(ToString(i) + "," + ToString(i * 2) + "," + + ToString(i + 1)); + } + cache_->ApplyToAllEntries(callback, /*opts*/ {}); std::sort(inserted.begin(), inserted.end()); std::sort(callback_state.begin(), callback_state.end()); - ASSERT_TRUE(inserted == callback_state); + ASSERT_EQ(inserted.size(), callback_state.size()); + for (size_t i = 0; i < inserted.size(); ++i) { + EXPECT_EQ(inserted[i], callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesDuringResize) { + // This is a mini-stress test of ApplyToAllEntries, to ensure + // items in the cache that are neither added nor removed + // during ApplyToAllEntries are counted exactly once. + + // Insert some entries that we expect to be seen exactly once + // during iteration. 
+ constexpr int kSpecialCharge = 2; + constexpr int kNotSpecialCharge = 1; + constexpr int kSpecialCount = 100; + for (int i = 0; i < kSpecialCount; ++i) { + Insert(i, i * 2, kSpecialCharge); + } + + // For callback + int special_count = 0; + const auto callback = [&](const Slice&, void*, size_t charge, + Cache::DeleterFn) { + if (charge == static_cast(kSpecialCharge)) { + ++special_count; + } + }; + + // Start counting + std::thread apply_thread([&]() { + // Use small average_entries_per_lock to make the problem difficult + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = 2; + cache_->ApplyToAllEntries(callback, opts); + }); + + // In parallel, add more entries, enough to cause resize but not enough + // to cause ejections + for (int i = kSpecialCount * 1; i < kSpecialCount * 6; ++i) { + Insert(i, i * 2, kNotSpecialCharge); + } + + apply_thread.join(); + ASSERT_EQ(special_count, kSpecialCount); } TEST_P(CacheTest, DefaultShardBits) { @@ -749,11 +822,12 @@ TEST_P(CacheTest, DefaultShardBits) { ASSERT_EQ(6, sc->GetNumShardBits()); } -TEST_P(CacheTest, GetCharge) { +TEST_P(CacheTest, GetChargeAndDeleter) { Insert(1, 2); Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); ASSERT_EQ(1, cache_->GetCharge(h1)); + ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1)); cache_->Release(h1); } diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 7934b378bdb..a3cb7d2c3f9 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -176,10 +176,13 @@ namespace { // Cache entry meta data. struct CacheHandle { Slice key; - uint32_t hash; void* value; size_t charge; - void (*deleter)(const Slice&, void* value); + Cache::DeleterFn deleter; + uint32_t hash; + + // Addition to "charge" to get "total charge" under metadata policy. + uint32_t meta_charge; // Flags and counters associated with the cache handle: // lowest bit: in-cache bit @@ -205,9 +208,8 @@ struct CacheHandle { return *this; } - inline static size_t CalcTotalCharge( - Slice key, size_t charge, - CacheMetadataChargePolicy metadata_charge_policy) { + inline static uint32_t CalcMetadataCharge( + Slice key, CacheMetadataChargePolicy metadata_charge_policy) { size_t meta_charge = 0; if (metadata_charge_policy == kFullChargeCacheMetadata) { meta_charge += sizeof(CacheHandle); @@ -218,13 +220,11 @@ struct CacheHandle { meta_charge += key.size(); #endif } - return charge + meta_charge; + assert(meta_charge <= UINT32_MAX); + return static_cast(meta_charge); } - inline size_t CalcTotalCharge( - CacheMetadataChargePolicy metadata_charge_policy) { - return CalcTotalCharge(key, charge, metadata_charge_policy); - } + inline size_t GetTotalCharge() { return charge + meta_charge; } }; // Key of hash map. We store hash value with the key for convenience. 
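The ApplyToAllEntries API exercised by the tests and cache_bench changes above replaces ApplyToAllCacheEntries: the callback now receives the key, value, charge, and deleter, and the scan visits a bounded number of entries per shard lock. A minimal, hypothetical caller-side sketch (editorial, not part of the patch); `DumpCacheTotals` is an assumed name.

#include <cstdio>

#include "rocksdb/cache.h"

void DumpCacheTotals(ROCKSDB_NAMESPACE::Cache* cache) {
  size_t total_charge = 0;
  size_t entry_count = 0;
  auto callback = [&](const ROCKSDB_NAMESPACE::Slice& /*key*/,
                      void* /*value*/, size_t charge,
                      ROCKSDB_NAMESPACE::Cache::DeleterFn /*deleter*/) {
    total_charge += charge;
    ++entry_count;
  };
  ROCKSDB_NAMESPACE::Cache::ApplyToAllEntriesOptions opts;
  // Limit how many entries are visited per shard lock so that concurrent
  // readers and writers are not stalled (same knob as cache_bench above).
  opts.average_entries_per_lock = 256;
  cache->ApplyToAllEntries(callback, opts);
  std::fprintf(stdout, "entries=%zu, total charge=%zu\n", entry_count,
               total_charge);
}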
@@ -271,7 +271,25 @@ class ClockCacheShard final : public CacheShard { Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Cache::Handle** handle, Cache::Priority priority) override; + Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, Cache::Priority priority) override { + return Insert(key, hash, value, charge, helper->del_cb, handle, priority); + } Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/) override { + return Lookup(key, hash); + } + bool Release(Cache::Handle* handle, bool /*useful*/, + bool force_erase) override { + return Release(handle, force_erase); + } + bool IsReady(Cache::Handle* /*handle*/) override { return true; } + void Wait(Cache::Handle* /*handle*/) override {} + // If the entry in in cache, increase reference count and return true. // Return false otherwise. // @@ -284,8 +302,10 @@ class ClockCacheShard final : public CacheShard { size_t GetUsage() const override; size_t GetPinnedUsage() const override; void EraseUnRefEntries() override; - void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; private: static const uint32_t kInCacheBit = 1; @@ -404,22 +424,46 @@ size_t ClockCacheShard::GetPinnedUsage() const { return pinned_usage_.load(std::memory_order_relaxed); } -void ClockCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - if (thread_safe) { - mutex_.Lock(); +void ClockCacheShard::ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) { + assert(average_entries_per_lock > 0); + MutexLock lock(&mutex_); + + // Figure out the range to iterate, update `state` + size_t list_size = list_.size(); + size_t start_idx = *state; + size_t end_idx = start_idx + average_entries_per_lock; + if (start_idx > list_size) { + // Shouldn't reach here, but recoverable + assert(false); + // Mark finished with all + *state = UINT32_MAX; + return; + } + if (end_idx >= list_size || end_idx >= UINT32_MAX) { + // This also includes the hypothetical case of >4 billion + // cache handles. + end_idx = list_size; + // Mark finished with all + *state = UINT32_MAX; + } else { + *state = static_cast(end_idx); } - for (auto& handle : list_) { - // Use relaxed semantics instead of acquire semantics since we are either - // holding mutex, or don't have thread safe requirement. 
+ + // Do the iteration + auto cur = list_.begin() + start_idx; + auto end = list_.begin() + end_idx; + for (; cur != end; ++cur) { + const CacheHandle& handle = *cur; + // Use relaxed semantics instead of acquire semantics since we are + // holding mutex uint32_t flags = handle.flags.load(std::memory_order_relaxed); if (InCache(flags)) { - callback(handle.value, handle.charge); + callback(handle.key, handle.value, handle.charge, handle.deleter); } } - if (thread_safe) { - mutex_.Unlock(); - } } void ClockCacheShard::RecycleHandle(CacheHandle* handle, @@ -428,10 +472,8 @@ void ClockCacheShard::RecycleHandle(CacheHandle* handle, assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0); context->to_delete_key.push_back(handle->key.data()); context->to_delete_value.emplace_back(*handle); - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); - handle->key.clear(); - handle->value = nullptr; - handle->deleter = nullptr; + size_t total_charge = handle->GetTotalCharge(); + // clearing `handle` fields would go here but not strictly required recycle_.push_back(handle); usage_.fetch_sub(total_charge, std::memory_order_relaxed); } @@ -459,7 +501,7 @@ bool ClockCacheShard::Ref(Cache::Handle* h) { std::memory_order_relaxed)) { if (CountRefs(flags) == 0) { // No reference count before the operation. - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + size_t total_charge = handle->GetTotalCharge(); pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } return true; @@ -473,6 +515,11 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage, if (set_usage) { handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed); } + // If the handle reaches state refs=0 and InCache=true after this + // atomic operation then we cannot access `handle` afterward, because + // it could be evicted before we access the `handle`. + size_t total_charge = handle->GetTotalCharge(); + // Use acquire-release semantics as previous operations on the cache entry // has to be order before reference count is decreased, and potential cleanup // of the entry has to be order after. @@ -480,7 +527,6 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage, assert(CountRefs(flags) > 0); if (CountRefs(flags) == 1) { // this is the last reference. - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed); // Cleanup if it is the last reference. if (!InCache(flags)) { @@ -567,8 +613,9 @@ CacheHandle* ClockCacheShard::Insert( void (*deleter)(const Slice& key, void* value), bool hold_reference, CleanupContext* context, bool* overwritten) { assert(overwritten != nullptr && *overwritten == false); - size_t total_charge = - CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_); + uint32_t meta_charge = + CacheHandle::CalcMetadataCharge(key, metadata_charge_policy_); + size_t total_charge = charge + meta_charge; MutexLock l(&mutex_); bool success = EvictFromCache(total_charge, context); bool strict = strict_capacity_limit_.load(std::memory_order_relaxed); @@ -594,8 +641,18 @@ CacheHandle* ClockCacheShard::Insert( handle->hash = hash; handle->value = value; handle->charge = charge; + handle->meta_charge = meta_charge; handle->deleter = deleter; uint32_t flags = hold_reference ? 
kInCacheBit + kOneRef : kInCacheBit; + + // TODO investigate+fix suspected race condition: + // [thread 1] Lookup starts, up to Ref() + // [thread 2] Erase/evict the entry just looked up + // [thread 1] Ref() the handle, even though it's in the recycle bin + // [thread 2] Insert with recycling that handle + // Here we obliterate the other thread's Ref + // Possible fix: never blindly overwrite the flags, but only make + // relative updates (fetch_add, etc). handle->flags.store(flags, std::memory_order_relaxed); HashTable::accessor accessor; if (table_.find(accessor, CacheKey(key, hash))) { @@ -726,11 +783,11 @@ class ClockCache final : public ShardedCache { const char* Name() const override { return "ClockCache"; } - CacheShard* GetShard(int shard) override { + CacheShard* GetShard(uint32_t shard) override { return reinterpret_cast(&shards_[shard]); } - const CacheShard* GetShard(int shard) const override { + const CacheShard* GetShard(uint32_t shard) const override { return reinterpret_cast(&shards_[shard]); } @@ -746,7 +803,17 @@ class ClockCache final : public ShardedCache { return reinterpret_cast(handle)->hash; } - void DisownData() override { shards_ = nullptr; } + DeleterFn GetDeleter(Handle* handle) const override { + return reinterpret_cast(handle)->deleter; + } + + void DisownData() override { +#ifndef MUST_FREE_HEAP_ALLOCATIONS + shards_ = nullptr; +#endif + } + + void WaitAll(std::vector& /*handles*/) override {} private: ClockCacheShard* shards_; diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index 04e612bdbcc..f7da46b69f8 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -9,26 +9,28 @@ #include "cache/lru_cache.h" -#include -#include -#include -#include +#include +#include +#include #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { - Resize(); -} +LRUHandleTable::LRUHandleTable(int max_upper_hash_bits) + : length_bits_(/* historical starting size*/ 4), + list_(new LRUHandle* [size_t{1} << length_bits_] {}), + elems_(0), + max_length_bits_(max_upper_hash_bits) {} LRUHandleTable::~LRUHandleTable() { - ApplyToAllCacheEntries([](LRUHandle* h) { - if (!h->HasRefs()) { - h->Free(); - } - }); - delete[] list_; + ApplyToEntriesRange( + [](LRUHandle* h) { + if (!h->HasRefs()) { + h->Free(); + } + }, + 0, uint32_t{1} << length_bits_); } LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) { @@ -42,7 +44,7 @@ LRUHandle* LRUHandleTable::Insert(LRUHandle* h) { *ptr = h; if (old == nullptr) { ++elems_; - if (elems_ > length_) { + if ((elems_ >> length_bits_) > 0) { // elems_ >= length // Since each cache entry is fairly large, we aim for a small // average linked list length (<= 1). Resize(); @@ -62,7 +64,7 @@ LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) { } LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) { - LRUHandle** ptr = &list_[hash & (length_ - 1)]; + LRUHandle** ptr = &list_[hash >> (32 - length_bits_)]; while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) { ptr = &(*ptr)->next_hash; } @@ -70,19 +72,29 @@ LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) { } void LRUHandleTable::Resize() { - uint32_t new_length = 16; - while (new_length < elems_ * 1.5) { - new_length *= 2; + if (length_bits_ >= max_length_bits_) { + // Due to reaching limit of hash information, if we made the table + // bigger, we would allocate more addresses but only the same + // number would be used. 
+ return; } - LRUHandle** new_list = new LRUHandle*[new_length]; - memset(new_list, 0, sizeof(new_list[0]) * new_length); + if (length_bits_ >= 31) { + // Avoid undefined behavior shifting uint32_t by 32 + return; + } + + uint32_t old_length = uint32_t{1} << length_bits_; + int new_length_bits = length_bits_ + 1; + std::unique_ptr new_list { + new LRUHandle* [size_t{1} << new_length_bits] {} + }; uint32_t count = 0; - for (uint32_t i = 0; i < length_; i++) { + for (uint32_t i = 0; i < old_length; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { LRUHandle* next = h->next_hash; uint32_t hash = h->hash; - LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + LRUHandle** ptr = &new_list[hash >> (32 - new_length_bits)]; h->next_hash = *ptr; *ptr = h; h = next; @@ -90,23 +102,25 @@ void LRUHandleTable::Resize() { } } assert(elems_ == count); - delete[] list_; - list_ = new_list; - length_ = new_length; + list_ = std::move(new_list); + length_bits_ = new_length_bits; } -LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, - double high_pri_pool_ratio, - bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) +LRUCacheShard::LRUCacheShard( + size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, + bool use_adaptive_mutex, CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + const std::shared_ptr& secondary_cache) : capacity_(0), high_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), high_pri_pool_ratio_(high_pri_pool_ratio), high_pri_pool_capacity_(0), + table_(max_upper_hash_bits), usage_(0), lru_usage_(0), - mutex_(use_adaptive_mutex) { + mutex_(use_adaptive_mutex), + secondary_cache_(secondary_cache) { set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list lru_.next = &lru_; @@ -138,19 +152,40 @@ void LRUCacheShard::EraseUnRefEntries() { } } -void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - const auto applyCallback = [&]() { - table_.ApplyToAllCacheEntries( - [callback](LRUHandle* h) { callback(h->value, h->charge); }); - }; - - if (thread_safe) { - MutexLock l(&mutex_); - applyCallback(); +void LRUCacheShard::ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) { + // The state is essentially going to be the starting hash, which works + // nicely even if we resize between calls because we use upper-most + // hash bits for table indexes. + MutexLock l(&mutex_); + uint32_t length_bits = table_.GetLengthBits(); + uint32_t length = uint32_t{1} << length_bits; + + assert(average_entries_per_lock > 0); + // Assuming we are called with same average_entries_per_lock repeatedly, + // this simplifies some logic (index_end will not overflow) + assert(average_entries_per_lock < length || *state == 0); + + uint32_t index_begin = *state >> (32 - length_bits); + uint32_t index_end = index_begin + average_entries_per_lock; + if (index_end >= length) { + // Going to end + index_end = length; + *state = UINT32_MAX; } else { - applyCallback(); + *state = index_end << (32 - length_bits); } + + table_.ApplyToEntriesRange( + [callback](LRUHandle* h) { + DeleterFn deleter = h->IsSecondaryCacheCompatible() + ? 
h->info_.helper->del_cb + : h->info_.deleter; + callback(h->key(), h->value, h->charge, deleter); + }, + index_begin, index_end); } void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { @@ -257,8 +292,14 @@ void LRUCacheShard::SetCapacity(size_t capacity) { EvictFromLRU(0, &last_reference_list); } + // Try to insert the evicted entries into tiered cache // Free the entries outside of mutex for performance reasons for (auto entry : last_reference_list) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsPromoted()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } entry->Free(); } } @@ -268,17 +309,176 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { strict_capacity_limit_ = strict_capacity_limit; } -Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { - MutexLock l(&mutex_); - LRUHandle* e = table_.Lookup(key, hash); - if (e != nullptr) { - assert(e->InCache()); - if (!e->HasRefs()) { - // The entry is in LRU since it's in hash and has no external references - LRU_Remove(e); +Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle, + bool free_handle_on_fail) { + Status s = Status::OK(); + autovector last_reference_list; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + + { + MutexLock l(&mutex_); + + // Free the space following strict LRU policy until enough space + // is freed or the lru list is empty + EvictFromLRU(total_charge, &last_reference_list); + + if ((usage_ + total_charge) > capacity_ && + (strict_capacity_limit_ || handle == nullptr)) { + e->SetInCache(false); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry inserted + // into cache and get evicted immediately. + last_reference_list.push_back(e); + } else { + if (free_handle_on_fail) { + delete[] reinterpret_cast(e); + *handle = nullptr; + } + s = Status::Incomplete("Insert failed due to LRU cache being full."); + } + } else { + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. 
+ LRUHandle* old = table_.Insert(e); + usage_ += total_charge; + if (old != nullptr) { + s = Status::OkOverwritten(); + assert(old->InCache()); + old->SetInCache(false); + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0 + LRU_Remove(old); + size_t old_total_charge = + old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; + last_reference_list.push_back(old); + } + } + if (handle == nullptr) { + LRU_Insert(e); + } else { + e->Ref(); + *handle = reinterpret_cast(e); + } + } + } + + // Try to insert the evicted entries into the secondary cache + // Free the entries here outside of mutex for performance reasons + for (auto entry : last_reference_list) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsPromoted()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } + entry->Free(); + } + + return s; +} + +void LRUCacheShard::Promote(LRUHandle* e) { + SecondaryCacheResultHandle* secondary_handle = e->sec_handle; + + assert(secondary_handle->IsReady()); + e->SetIncomplete(false); + e->SetInCache(true); + e->SetPromoted(true); + e->value = secondary_handle->Value(); + e->charge = secondary_handle->Size(); + delete secondary_handle; + + // This call could fail if the cache is over capacity and + // strict_capacity_limit_ is true. In such a case, we don't want + // InsertItem() to free the handle, since the item is already in memory + // and the caller will most likely just read from disk if we erase it here. + if (e->value) { + Cache::Handle* handle = reinterpret_cast(e); + Status s = InsertItem(e, &handle, /*free_handle_on_fail=*/false); + if (s.ok()) { + // InsertItem would have taken a reference on the item, so decrement it + // here as we expect the caller to already hold a reference + e->Unref(); + } else { + // Item is in memory, but not accounted against the cache capacity. + // When the handle is released, the item should get deleted + assert(!e->InCache()); + } + } else { + // Since the secondary cache lookup failed, mark the item as not in cache + // and charge the cache only for metadata usage, i.e handle, key etc + MutexLock l(&mutex_); + e->charge = 0; + e->SetInCache(false); + usage_ += e->CalcTotalCharge(metadata_charge_policy_); + } +} + +Cache::Handle* LRUCacheShard::Lookup( + const Slice& key, uint32_t hash, + const ShardedCache::CacheItemHelper* helper, + const ShardedCache::CreateCallback& create_cb, Cache::Priority priority, + bool wait) { + LRUHandle* e = nullptr; + { + MutexLock l(&mutex_); + e = table_.Lookup(key, hash); + if (e != nullptr) { + assert(e->InCache()); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references + LRU_Remove(e); + } + e->Ref(); + e->SetHit(); + } + } + + // If handle table lookup failed, then allocate a handle outside the + // mutex if we're going to lookup in the secondary cache + // Only support synchronous for now + // TODO: Support asynchronous lookup in secondary cache + if (!e && secondary_cache_ && helper && helper->saveto_cb) { + // For objects from the secondary cache, we expect the caller to provide + // a way to create/delete the primary cache object. The only case where + // a deleter would not be required is for dummy entries inserted for + // accounting purposes, which we won't demote to the secondary cache + // anyway. 
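
Editor's note: the refactored InsertItem() above keeps the caller-visible contract of Insert(): when an entry cannot fit, a call that passes a handle under a strict capacity limit gets Status::Incomplete(), while a call with a null handle still returns OK and the entry is dropped as if it had been evicted immediately. A sketch of that behavior against the public API, assuming the semantics shown in the patch (capacity and charge values are arbitrary):

#include <cassert>
#include <memory>
#include "rocksdb/cache.h"

using ROCKSDB_NAMESPACE::Cache;
using ROCKSDB_NAMESPACE::LRUCacheOptions;
using ROCKSDB_NAMESPACE::NewLRUCache;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::Status;

int main() {
  LRUCacheOptions opts;
  opts.capacity = 1024;
  opts.strict_capacity_limit = true;
  std::shared_ptr<Cache> cache = NewLRUCache(opts);

  static int value = 0;
  auto noop_deleter = [](const Slice& /*key*/, void* /*value*/) {};

  // Larger than the whole cache and requested with a handle: rejected.
  Cache::Handle* handle = nullptr;
  Status s = cache->Insert("k1", &value, /*charge=*/4096, noop_deleter, &handle);
  assert(s.IsIncomplete() && handle == nullptr);

  // Same charge but no handle requested: accepted, then dropped right away.
  s = cache->Insert("k2", &value, /*charge=*/4096, noop_deleter, nullptr);
  assert(s.ok());
  return 0;
}
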
+ assert(create_cb && helper->del_cb); + std::unique_ptr secondary_handle = + secondary_cache_->Lookup(key, create_cb, wait); + if (secondary_handle != nullptr) { + e = reinterpret_cast( + new char[sizeof(LRUHandle) - 1 + key.size()]); + + e->flags = 0; + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + e->key_length = key.size(); + e->hash = hash; + e->refs = 0; + e->next = e->prev = nullptr; + e->SetPriority(priority); + memcpy(e->key_data, key.data(), key.size()); + e->value = nullptr; + e->sec_handle = secondary_handle.release(); + e->Ref(); + + if (wait) { + Promote(e); + if (!e->value) { + // The secondary cache returned a handle, but the lookup failed + e->Unref(); + e->Free(); + e = nullptr; + } + } else { + // If wait is false, we always return a handle and let the caller + // release the handle after checking for success or failure + e->SetIncomplete(true); + } } - e->Ref(); - e->SetHit(); } return reinterpret_cast(e); } @@ -339,81 +539,32 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) { Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), + const Cache::CacheItemHelper* helper, Cache::Handle** handle, Cache::Priority priority) { // Allocate the memory here outside of the mutex // If the cache is full, we'll have to release it // It shouldn't happen very often though. LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); - Status s = Status::OK(); - autovector last_reference_list; e->value = value; - e->deleter = deleter; + e->flags = 0; + if (helper) { + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + } else { + e->info_.deleter = deleter; + } e->charge = charge; e->key_length = key.size(); - e->flags = 0; e->hash = hash; e->refs = 0; e->next = e->prev = nullptr; e->SetInCache(true); e->SetPriority(priority); memcpy(e->key_data, key.data(), key.size()); - size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); - { - MutexLock l(&mutex_); - - // Free the space following strict LRU policy until enough space - // is freed or the lru list is empty - EvictFromLRU(total_charge, &last_reference_list); - - if ((usage_ + total_charge) > capacity_ && - (strict_capacity_limit_ || handle == nullptr)) { - if (handle == nullptr) { - // Don't insert the entry but still return ok, as if the entry inserted - // into cache and get evicted immediately. - e->SetInCache(false); - last_reference_list.push_back(e); - } else { - delete[] reinterpret_cast(e); - *handle = nullptr; - s = Status::Incomplete("Insert failed due to LRU cache being full."); - } - } else { - // Insert into the cache. Note that the cache might get larger than its - // capacity if not enough space was freed up. 
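
Editor's note: both the secondary-cache path in Lookup() and Insert() above allocate handles with new char[sizeof(LRUHandle) - 1 + key.size()] so the key bytes live inline after the struct. A self-contained sketch of that layout trick (the Handle type here is a stand-in, not the real LRUHandle):

#include <cstring>
#include <iostream>
#include <string>

// The struct ends in a one-byte array; the allocation is over-sized so the
// full key is stored contiguously with the handle, avoiding a second heap
// allocation per entry.
struct Handle {
  size_t key_length;
  char key_data[1];  // beginning of the key; the real key extends past it

  std::string key() const { return std::string(key_data, key_length); }
};

Handle* NewHandle(const std::string& key) {
  Handle* h = reinterpret_cast<Handle*>(new char[sizeof(Handle) - 1 + key.size()]);
  h->key_length = key.size();
  std::memcpy(h->key_data, key.data(), key.size());
  return h;
}

int main() {
  Handle* h = NewHandle("block#12345");
  std::cout << h->key() << "\n";             // block#12345
  delete[] reinterpret_cast<char*>(h);       // mirrors how LRUHandle memory is freed
  return 0;
}
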
- LRUHandle* old = table_.Insert(e); - usage_ += total_charge; - if (old != nullptr) { - s = Status::OkOverwritten(); - assert(old->InCache()); - old->SetInCache(false); - if (!old->HasRefs()) { - // old is on LRU because it's in cache and its reference count is 0 - LRU_Remove(old); - size_t old_total_charge = - old->CalcTotalCharge(metadata_charge_policy_); - assert(usage_ >= old_total_charge); - usage_ -= old_total_charge; - last_reference_list.push_back(old); - } - } - if (handle == nullptr) { - LRU_Insert(e); - } else { - e->Ref(); - *handle = reinterpret_cast(e); - } - } - } - - // Free the entries here outside of mutex for performance reasons - for (auto entry : last_reference_list) { - entry->Free(); - } - - return s; + return InsertItem(e, handle, /* free_handle_on_fail */ true); } void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { @@ -443,6 +594,18 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { } } +bool LRUCacheShard::IsReady(Cache::Handle* handle) { + LRUHandle* e = reinterpret_cast(handle); + MutexLock l(&mutex_); + bool ready = true; + if (e->IsPending()) { + assert(secondary_cache_); + assert(e->sec_handle); + ready = e->sec_handle->IsReady(); + } + return ready; +} + size_t LRUCacheShard::GetUsage() const { MutexLock l(&mutex_); return usage_; @@ -469,7 +632,8 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr& secondary_cache) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, std::move(allocator)) { num_shards_ = 1 << num_shard_bits; @@ -477,10 +641,12 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits, port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_)); size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; for (int i = 0; i < num_shards_; i++) { - new (&shards_[i]) - LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio, - use_adaptive_mutex, metadata_charge_policy); + new (&shards_[i]) LRUCacheShard( + per_shard, strict_capacity_limit, high_pri_pool_ratio, + use_adaptive_mutex, metadata_charge_policy, + /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache); } + secondary_cache_ = secondary_cache; } LRUCache::~LRUCache() { @@ -493,11 +659,11 @@ LRUCache::~LRUCache() { } } -CacheShard* LRUCache::GetShard(int shard) { +CacheShard* LRUCache::GetShard(uint32_t shard) { return reinterpret_cast(&shards_[shard]); } -const CacheShard* LRUCache::GetShard(int shard) const { +const CacheShard* LRUCache::GetShard(uint32_t shard) const { return reinterpret_cast(&shards_[shard]); } @@ -509,23 +675,25 @@ size_t LRUCache::GetCharge(Handle* handle) const { return reinterpret_cast(handle)->charge; } +Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { + auto h = reinterpret_cast(handle); + if (h->IsSecondaryCacheCompatible()) { + return h->info_.helper->del_cb; + } else { + return h->info_.deleter; + } +} + uint32_t LRUCache::GetHash(Handle* handle) const { return reinterpret_cast(handle)->hash; } void LRUCache::DisownData() { // Do not drop data if compile with ASAN to suppress leak warning. 
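
Editor's note: the LRUCache constructor above splits the total capacity across shards with a ceiling division, and (as the header comments later in the patch note) shard selection uses the low hash bits while each shard's table index uses the high bits. A small arithmetic sketch with made-up shard count and hash:

#include <cassert>
#include <cstdint>

int main() {
  const int num_shard_bits = 4;
  const uint32_t num_shards = uint32_t{1} << num_shard_bits;      // 16 shards
  const size_t capacity = 1000;

  // Ceiling division: per-shard capacities sum to at least the request.
  const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
  assert(per_shard == 63 && per_shard * num_shards >= capacity);

  // Low bits pick the shard; the shard's table index comes from the high
  // bits, so the two never compete for the same bits of the hash.
  const uint32_t shard_mask = num_shards - 1;
  const uint32_t hash = 0xABCD1234;
  assert((hash & shard_mask) == 0x4);
  return 0;
}
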
-#if defined(__clang__) -#if !defined(__has_feature) || !__has_feature(address_sanitizer) +#ifndef MUST_FREE_HEAP_ALLOCATIONS shards_ = nullptr; num_shards_ = 0; #endif -#else // __clang__ -#ifndef __SANITIZE_ADDRESS__ - shards_ = nullptr; - num_shards_ = 0; -#endif // !__SANITIZE_ADDRESS__ -#endif // __clang__ } size_t LRUCache::TEST_GetLRUSize() { @@ -544,19 +712,42 @@ double LRUCache::GetHighPriPoolRatio() { return result; } -std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { - return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, - cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, - cache_opts.metadata_charge_policy); +void LRUCache::WaitAll(std::vector& handles) { + if (secondary_cache_) { + std::vector sec_handles; + sec_handles.reserve(handles.size()); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast(handle); + if (!lru_handle->IsPending()) { + continue; + } + sec_handles.emplace_back(lru_handle->sec_handle); + } + secondary_cache_->WaitAll(sec_handles); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast(handle); + if (!lru_handle->IsPending()) { + continue; + } + uint32_t hash = GetHash(handle); + LRUCacheShard* shard = static_cast(GetShard(Shard(hash))); + shard->Promote(lru_handle); + } + } } std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) { + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr& secondary_cache) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -569,7 +760,25 @@ std::shared_ptr NewLRUCache( } return std::make_shared( capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, - std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy); + std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy, + secondary_cache); +} + +std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { + return NewLRUCache( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); } +std::shared_ptr NewLRUCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, + std::shared_ptr memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) { + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, + metadata_charge_policy, nullptr); +} } // namespace ROCKSDB_NAMESPACE diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 827e0bece2b..af0155ad9fb 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). @@ -8,12 +8,13 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #pragma once +#include #include #include "cache/sharded_cache.h" - #include "port/malloc.h" #include "port/port.h" +#include "rocksdb/secondary_cache.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -49,8 +50,18 @@ namespace ROCKSDB_NAMESPACE { struct LRUHandle { void* value; - void (*deleter)(const Slice&, void* value); - LRUHandle* next_hash; + union Info { + Info() {} + ~Info() {} + Cache::DeleterFn deleter; + const ShardedCache::CacheItemHelper* helper; + } info_; + // An entry is not added to the LRUHandleTable until the secondary cache + // lookup is complete, so its safe to have this union. + union { + LRUHandle* next_hash; + SecondaryCacheResultHandle* sec_handle; + }; LRUHandle* next; LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? @@ -67,8 +78,14 @@ struct LRUHandle { IS_HIGH_PRI = (1 << 1), // Whether this entry is in high-pri pool. IN_HIGH_PRI_POOL = (1 << 2), - // Wwhether this entry has had any lookups (hits). + // Whether this entry has had any lookups (hits). HAS_HIT = (1 << 3), + // Can this be inserted into the tiered cache + IS_TIERED_CACHE_COMPATIBLE = (1 << 4), + // Is the handle still being read from a lower tier + IS_PENDING = (1 << 5), + // Has the item been promoted from a lower tier + IS_PROMOTED = (1 << 6), }; uint8_t flags; @@ -95,6 +112,11 @@ struct LRUHandle { bool IsHighPri() const { return flags & IS_HIGH_PRI; } bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } bool HasHit() const { return flags & HAS_HIT; } + bool IsSecondaryCacheCompatible() const { + return flags & IS_TIERED_CACHE_COMPATIBLE; + } + bool IsPending() const { return flags & IS_PENDING; } + bool IsPromoted() const { return flags & IS_PROMOTED; } void SetInCache(bool in_cache) { if (in_cache) { @@ -122,15 +144,50 @@ struct LRUHandle { void SetHit() { flags |= HAS_HIT; } + void SetSecondaryCacheCompatible(bool tiered) { + if (tiered) { + flags |= IS_TIERED_CACHE_COMPATIBLE; + } else { + flags &= ~IS_TIERED_CACHE_COMPATIBLE; + } + } + + void SetIncomplete(bool incomp) { + if (incomp) { + flags |= IS_PENDING; + } else { + flags &= ~IS_PENDING; + } + } + + void SetPromoted(bool promoted) { + if (promoted) { + flags |= IS_PROMOTED; + } else { + flags &= ~IS_PROMOTED; + } + } + void Free() { assert(refs == 0); - if (deleter) { - (*deleter)(key(), value); + if (!IsSecondaryCacheCompatible() && info_.deleter) { + (*info_.deleter)(key(), value); + } else if (IsSecondaryCacheCompatible()) { + if (IsPending()) { + assert(sec_handle != nullptr); + SecondaryCacheResultHandle* tmp_sec_handle = sec_handle; + tmp_sec_handle->Wait(); + value = tmp_sec_handle->Value(); + delete tmp_sec_handle; + } + if (value) { + (*info_.helper->del_cb)(key(), value); + } } delete[] reinterpret_cast(this); } - // Caclculate the memory usage by metadata + // Calculate the memory usage by metadata inline size_t CalcTotalCharge( CacheMetadataChargePolicy metadata_charge_policy) { size_t meta_charge = 0; @@ -153,7 +210,10 @@ struct LRUHandle { // 4.4.3's builtin hashtable. class LRUHandleTable { public: - LRUHandleTable(); + // If the table uses more hash bits than `max_upper_hash_bits`, + // it will eat into the bits used for sharding, which are constant + // for a given LRUHandleTable. 
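
Editor's note: the reworked LRUHandle above stores either a plain deleter or a CacheItemHelper pointer in a union discriminated by a flag bit, and overlays next_hash with sec_handle because an entry is never in the hash table while its secondary-cache lookup is pending. A toy version of the flag-discriminated union (all types below are stand-ins for illustration, not the RocksDB definitions):

#include <cstdint>
#include <iostream>

using DeleterFn = void (*)(const char* key, void* value);
struct CacheItemHelper {
  DeleterFn del_cb;
};

struct Handle {
  static constexpr uint8_t kSecondaryCompatible = 1 << 4;  // mirrors the flag idea
  uint8_t flags = 0;
  union Info {
    DeleterFn deleter;
    const CacheItemHelper* helper;
  } info_;

  // Callers must consult the flag before deciding which union member is live.
  DeleterFn GetDeleter() const {
    return (flags & kSecondaryCompatible) ? info_.helper->del_cb : info_.deleter;
  }
};

int main() {
  static const CacheItemHelper helper{
      [](const char* /*key*/, void* /*value*/) { std::cout << "helper del_cb\n"; }};

  Handle h;
  h.flags |= Handle::kSecondaryCompatible;  // marks info_.helper as the live member
  h.info_.helper = &helper;
  h.GetDeleter()("key", nullptr);           // prints "helper del_cb"
  return 0;
}
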
+ explicit LRUHandleTable(int max_upper_hash_bits); ~LRUHandleTable(); LRUHandle* Lookup(const Slice& key, uint32_t hash); @@ -161,8 +221,8 @@ class LRUHandleTable { LRUHandle* Remove(const Slice& key, uint32_t hash); template - void ApplyToAllCacheEntries(T func) { - for (uint32_t i = 0; i < length_; i++) { + void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { + for (uint32_t i = index_begin; i < index_end; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { auto n = h->next_hash; @@ -173,6 +233,8 @@ class LRUHandleTable { } } + int GetLengthBits() const { return length_bits_; } + private: // Return a pointer to slot that points to a cache entry that // matches key/hash. If there is no such cache entry, return a @@ -181,11 +243,19 @@ class LRUHandleTable { void Resize(); + // Number of hash bits (upper because lower bits used for sharding) + // used for table index. Length == 1 << length_bits_ + int length_bits_; + // The table consists of an array of buckets where each bucket is // a linked list of cache entries that hash into the bucket. - LRUHandle** list_; - uint32_t length_; + std::unique_ptr list_; + + // Number of elements currently in the table uint32_t elems_; + + // Set from max_upper_hash_bits (see constructor) + const int max_length_bits_; }; // A single shard of sharded cache. @@ -193,7 +263,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy); + CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + const std::shared_ptr& secondary_cache); virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache @@ -209,11 +281,34 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // Like Cache methods, but with an extra "hash" parameter. 
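
Editor's note: a toy driver for the ApplyToSomeEntries()/ApplyToEntriesRange() protocol above: the caller keeps a 32-bit state cursor, passes it back on every call, and stops once the shard sets it to UINT32_MAX. The bucket count and batch size below are made up for illustration:

#include <cstdint>
#include <iostream>

int main() {
  const uint32_t length_bits = 5;                      // 32 buckets in this toy shard
  const uint32_t length = uint32_t{1} << length_bits;
  const uint32_t average_entries_per_lock = 12;        // buckets visited per batch

  uint32_t state = 0;
  while (true) {
    // What the shard does internally with the cursor on each call:
    uint32_t index_begin = state >> (32 - length_bits);
    uint32_t index_end = index_begin + average_entries_per_lock;
    if (index_end >= length) {
      index_end = length;        // last batch: run to the end of the table
      state = UINT32_MAX;        // tell the caller iteration is complete
    } else {
      state = index_end << (32 - length_bits);
    }
    std::cout << "visit buckets [" << index_begin << ", " << index_end << ")\n";
    if (state == UINT32_MAX) break;
  }
  // Prints [0, 12), [12, 24), [24, 32): three batches cover every bucket once.
  return 0;
}
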
virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), + size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, - Cache::Priority priority) override; - virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + Cache::Priority priority) override { + return Insert(key, hash, value, charge, deleter, nullptr, handle, priority); + } + virtual Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, + Cache::Priority priority) override { + assert(helper); + return Insert(key, hash, value, charge, nullptr, helper, handle, priority); + } + // If helper_cb is null, the values of the following arguments don't + // matter + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const ShardedCache::CacheItemHelper* helper, + const ShardedCache::CreateCallback& create_cb, + ShardedCache::Priority priority, + bool wait) override; + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override { + return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true); + } + virtual bool Release(Cache::Handle* handle, bool /*useful*/, + bool force_erase) override { + return Release(handle, force_erase); + } + virtual bool IsReady(Cache::Handle* /*handle*/) override; + virtual void Wait(Cache::Handle* /*handle*/) override {} virtual bool Ref(Cache::Handle* handle) override; virtual bool Release(Cache::Handle* handle, bool force_erase = false) override; @@ -226,8 +321,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { virtual size_t GetUsage() const override; virtual size_t GetPinnedUsage() const override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + virtual void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; virtual void EraseUnRefEntries() override; @@ -239,10 +336,27 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // not threadsafe size_t TEST_GetLRUSize(); - // Retrives high pri pool ratio + // Retrieves high pri pool ratio double GetHighPriPoolRatio(); private: + friend class LRUCache; + // Insert an item into the hash table and, if handle is null, insert into + // the LRU list. Older items are evicted as necessary. If the cache is full + // and free_handle_on_fail is true, the item is deleted and handle is set to. + Status InsertItem(LRUHandle* item, Cache::Handle** handle, + bool free_handle_on_fail); + Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, + DeleterFn deleter, const Cache::CacheItemHelper* helper, + Cache::Handle** handle, Cache::Priority priority); + // Promote an item looked up from the secondary cache to the LRU cache. The + // item is only inserted into the hash table and not the LRU list, and only + // if the cache is not at full capacity, as is the case during Insert. The + // caller should hold a reference on the LRUHandle. When the caller releases + // the last reference, the item is added to the LRU list. + // The item is promoted to the high pri or low pri pool as specified by the + // caller in Lookup. 
+ void Promote(LRUHandle* e); void LRU_Remove(LRUHandle* e); void LRU_Insert(LRUHandle* e); @@ -303,6 +417,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // We don't count mutex_ as the cache's internal state so semantically we // don't mind mutex_ invoking the non-const actions. mutable port::Mutex mutex_; + + std::shared_ptr secondary_cache_; }; class LRUCache @@ -316,24 +432,28 @@ class LRUCache std::shared_ptr memory_allocator = nullptr, bool use_adaptive_mutex = kDefaultToAdaptiveMutex, CacheMetadataChargePolicy metadata_charge_policy = - kDontChargeCacheMetadata); + kDontChargeCacheMetadata, + const std::shared_ptr& secondary_cache = nullptr); virtual ~LRUCache(); virtual const char* Name() const override { return "LRUCache"; } - virtual CacheShard* GetShard(int shard) override; - virtual const CacheShard* GetShard(int shard) const override; + virtual CacheShard* GetShard(uint32_t shard) override; + virtual const CacheShard* GetShard(uint32_t shard) const override; virtual void* Value(Handle* handle) override; virtual size_t GetCharge(Handle* handle) const override; virtual uint32_t GetHash(Handle* handle) const override; + virtual DeleterFn GetDeleter(Handle* handle) const override; virtual void DisownData() override; + virtual void WaitAll(std::vector& handles) override; // Retrieves number of elements in LRU, for unit test purpose only size_t TEST_GetLRUSize(); - // Retrives high pri pool ratio + // Retrieves high pri pool ratio double GetHighPriPoolRatio(); private: LRUCacheShard* shards_ = nullptr; int num_shards_ = 0; + std::shared_ptr secondary_cache_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 08c05024aef..d20fd246360 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -7,8 +7,18 @@ #include #include + +#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/io_status.h" +#include "rocksdb/sst_file_manager.h" #include "test_util/testharness.h" +#include "util/coding.h" +#include "util/random.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -30,9 +40,10 @@ class LRUCacheTest : public testing::Test { DeleteCache(); cache_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); - new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/, - high_pri_pool_ratio, use_adaptive_mutex, - kDontChargeCacheMetadata); + new (cache_) LRUCacheShard( + capacity, false /*strict_capcity_limit*/, high_pri_pool_ratio, + use_adaptive_mutex, kDontChargeCacheMetadata, + 24 /*max_upper_hash_bits*/, nullptr /*secondary_cache*/); } void Insert(const std::string& key, @@ -191,6 +202,978 @@ TEST_F(LRUCacheTest, EntriesWithPriority) { ValidateLRUList({"e", "f", "g", "Z", "d"}, 2); } +class TestSecondaryCache : public SecondaryCache { + public: + // Specifies what action to take on a lookup for a particular key + enum ResultType { + SUCCESS, + // Fail lookup immediately + FAIL, + // Defer the result. 
It will returned after Wait/WaitAll is called + DEFER, + // Defer the result and eventually return failure + DEFER_AND_FAIL + }; + + using ResultMap = std::unordered_map; + + explicit TestSecondaryCache(size_t capacity) + : num_inserts_(0), num_lookups_(0), inject_failure_(false) { + cache_ = NewLRUCache(capacity, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + } + ~TestSecondaryCache() override { cache_.reset(); } + + std::string Name() override { return "TestSecondaryCache"; } + + void InjectFailure() { inject_failure_ = true; } + + void ResetInjectFailure() { inject_failure_ = false; } + + void SetDbSessionId(const std::string& db_session_id) { + db_session_id_ = db_session_id; + } + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) override { + if (inject_failure_) { + return Status::Corruption("Insertion Data Corrupted"); + } + assert(IsDbSessionIdAsKeyPrefix(key) == true); + size_t size; + char* buf; + Status s; + + num_inserts_++; + size = (*helper->size_cb)(value); + buf = new char[size + sizeof(uint64_t)]; + EncodeFixed64(buf, size); + s = (*helper->saveto_cb)(value, 0, size, buf + sizeof(uint64_t)); + if (!s.ok()) { + delete[] buf; + return s; + } + return cache_->Insert(key, buf, size, + [](const Slice& /*key*/, void* val) -> void { + delete[] static_cast(val); + }); + } + + std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, + bool /*wait*/) override { + std::string key_str = key.ToString(); + TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); + + std::unique_ptr secondary_handle; + ResultType type = ResultType::SUCCESS; + auto iter = result_map_.find(key.ToString()); + if (iter != result_map_.end()) { + type = iter->second; + } + if (type == ResultType::FAIL) { + return secondary_handle; + } + + Cache::Handle* handle = cache_->Lookup(key); + num_lookups_++; + if (handle) { + void* value = nullptr; + size_t charge = 0; + Status s; + if (type != ResultType::DEFER_AND_FAIL) { + char* ptr = (char*)cache_->Value(handle); + size_t size = DecodeFixed64(ptr); + ptr += sizeof(uint64_t); + s = create_cb(ptr, size, &value, &charge); + } + if (s.ok()) { + secondary_handle.reset(new TestSecondaryCacheResultHandle( + cache_.get(), handle, value, charge, type)); + } else { + cache_->Release(handle); + } + } + return secondary_handle; + } + + void Erase(const Slice& /*key*/) override {} + + void WaitAll(std::vector handles) override { + for (SecondaryCacheResultHandle* handle : handles) { + TestSecondaryCacheResultHandle* sec_handle = + static_cast(handle); + sec_handle->SetReady(); + } + } + + std::string GetPrintableOptions() const override { return ""; } + + void SetResultMap(ResultMap&& map) { result_map_ = std::move(map); } + + uint32_t num_inserts() { return num_inserts_; } + + uint32_t num_lookups() { return num_lookups_; } + + bool IsDbSessionIdAsKeyPrefix(const Slice& key) { + if (db_session_id_.size() == 0) { + return true; + } + if (key.size() < 20) { + return false; + } + std::string s_key = key.ToString(); + if (s_key.substr(0, 20) != db_session_id_) { + return false; + } + return true; + } + + private: + class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle { + public: + TestSecondaryCacheResultHandle(Cache* cache, Cache::Handle* handle, + void* value, size_t size, ResultType type) + : cache_(cache), + handle_(handle), + value_(value), + size_(size), + is_ready_(true) { + if (type != ResultType::SUCCESS) { + is_ready_ = false; + } + } + + 
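
Editor's note: TestSecondaryCache::Insert() above flattens an object with the helper's size_cb/saveto_cb and prefixes the bytes with their length, and Lookup() reverses that with create_cb. A standalone round-trip sketch of that contract (Item and the three callbacks are illustrative; the 8-byte host-endian prefix stands in for EncodeFixed64):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

struct Item {
  std::string payload;
};

// The three roles played by a CacheItemHelper / CreateCallback:
size_t SizeCb(const Item& item) { return item.payload.size(); }
void SaveToCb(const Item& item, char* out) {
  std::memcpy(out, item.payload.data(), item.payload.size());
}
Item CreateCb(const char* buf, size_t size) { return Item{std::string(buf, size)}; }

int main() {
  Item item{"hello, secondary cache"};

  // "Insert": frame the object as [8-byte length][payload bytes].
  uint64_t len = SizeCb(item);
  std::vector<char> buf(sizeof(len) + len);
  std::memcpy(buf.data(), &len, sizeof(len));
  SaveToCb(item, buf.data() + sizeof(len));

  // "Lookup": read the length back, then rebuild the object.
  uint64_t stored_len = 0;
  std::memcpy(&stored_len, buf.data(), sizeof(stored_len));
  Item restored = CreateCb(buf.data() + sizeof(stored_len), stored_len);
  assert(restored.payload == item.payload);
  return 0;
}
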
~TestSecondaryCacheResultHandle() override { cache_->Release(handle_); } + + bool IsReady() override { return is_ready_; } + + void Wait() override {} + + void* Value() override { + assert(is_ready_); + return value_; + } + + size_t Size() override { return Value() ? size_ : 0; } + + void SetReady() { is_ready_ = true; } + + private: + Cache* cache_; + Cache::Handle* handle_; + void* value_; + size_t size_; + bool is_ready_; + }; + + std::shared_ptr cache_; + uint32_t num_inserts_; + uint32_t num_lookups_; + bool inject_failure_; + std::string db_session_id_; + ResultMap result_map_; +}; + +class DBSecondaryCacheTest : public DBTestBase { + public: + DBSecondaryCacheTest() + : DBTestBase("/db_secondary_cache_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; +}; + +class LRUSecondaryCacheTest : public LRUCacheTest { + public: + LRUSecondaryCacheTest() : fail_create_(false) {} + ~LRUSecondaryCacheTest() {} + + protected: + class TestItem { + public: + TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { + memcpy(buf_.get(), buf, size); + } + ~TestItem() {} + + char* Buf() { return buf_.get(); } + size_t Size() { return size_; } + std::string ToString() { return std::string(Buf(), Size()); } + + private: + std::unique_ptr buf_; + size_t size_; + }; + + static size_t SizeCallback(void* obj) { + return reinterpret_cast(obj)->Size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + TestItem* item = reinterpret_cast(from_obj); + char* buf = item->Buf(); + EXPECT_EQ(length, item->Size()); + EXPECT_EQ(from_offset, 0); + memcpy(out, buf, length); + return Status::OK(); + } + + static void DeletionCallback(const Slice& /*key*/, void* obj) { + delete reinterpret_cast(obj); + } + + static Cache::CacheItemHelper helper_; + + static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/, + size_t /*size*/, void* /*out*/) { + return Status::NotSupported(); + } + + static Cache::CacheItemHelper helper_fail_; + + Cache::CreateCallback test_item_creator = + [&](void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + if (fail_create_) { + return Status::NotSupported(); + } + *out_obj = reinterpret_cast(new TestItem((char*)buf, size)); + *charge = size; + return Status::OK(); + }; + + void SetFailCreate(bool fail) { fail_create_ = fail; } + + private: + bool fail_create_; +}; + +Cache::CacheItemHelper LRUSecondaryCacheTest::helper_( + LRUSecondaryCacheTest::SizeCallback, LRUSecondaryCacheTest::SaveToCallback, + LRUSecondaryCacheTest::DeletionCallback); + +Cache::CacheItemHelper LRUSecondaryCacheTest::helper_fail_( + LRUSecondaryCacheTest::SizeCallback, + LRUSecondaryCacheTest::SaveToCallbackFail, + LRUSecondaryCacheTest::DeletionCallback); + +TEST_F(LRUSecondaryCacheTest, BasicTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), 
str2.length()); + // k2 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should promote k1 and demote k2 + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, BasicFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_NOK(cache->Insert("k1", item1, nullptr, str1.length())); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", nullptr, test_item_creator, Cache::Priority::LOW, + true); + ASSERT_EQ(handle, nullptr); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false); + ASSERT_EQ(handle, nullptr); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, SaveFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_fail_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_fail_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 demotion would have failed + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, CreateFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = 
rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + SetFailCreate(true); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 creation would have failed + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, FullCapacityTest) { + LRUCacheOptions opts(1024, 0, /*_strict_capacity_limit=*/true, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + // k1 promotion should fail due to the block cache being at capacity, + // but the lookup should still succeed + Cache::Handle* handle2; + handle2 = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle2, nullptr); + // Since k1 didn't get inserted, k2 should still be in cache + cache->Release(handle); + cache->Release(handle2); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Note that, block_1 is never successfully + // inserted to the block cache. Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. 
After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + + Destroy(options); +} + +// In this test, the block cache size is set to 6100, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// insert and cache block_1 in the block cache (this is the different place +// from TestSecondaryCacheCorrectness1) +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { + LRUCacheOptions opts(6100, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Thefore, block_1 is evicted from block + // cache and successfully inserted to the secondary cache. Here are 2 + // lookups in the secondary cache for block_1 and block_2. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. After Flush, only block_2 is cached in block cache + // and block_1 is in the secondary cache. So when read block_1, it is + // read out from secondary cache and inserted to block cache. At the same + // time, block_2 is inserted to secondary cache. Now, secondary cache has + // both block_1 and block_2. After compaction, block_1 is in the cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is not in the block cache. So + // it will lookup the secondary cache for block_2 and cache it in the + // block_cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is already in the block cache. + // No need to lookup secondary cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is not in block cache + // there is one econdary cache lookup. Then, block_1 is cached in the + // block cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + Destroy(options); +} + +// The block cache size is set to 1024*1024, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// cache all the blocks in the block cache and there is not secondary cache +// insertion. 2 lookup is needed for the blocks. +TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { + LRUCacheOptions opts(1024 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. Now, block cache is large enough, it cache + // both block_1 and block_2. When first time read block_1 and block_2 + // there are cache misses. So 2 secondary cache lookups are needed for + // the 2 blocks + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will iterate the whole SST file. Since all the data blocks + // are in the block cache. No need to lookup the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1000, v.size()); + // Since the block cache is large enough, all the blocks are cached. we + // do not need to lookup the seondary cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Destroy(options); +} + +TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) { + LRUCacheOptions opts(8 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + Compact("a", "z"); + + Random r_index(47); + std::string v; + for (int i = 0; i < 1000; i++) { + uint32_t key_i = r_index.Next() % N; + v = Get(Key(key_i)); + } + + // We have over 200 data blocks there will be multiple insertion + // and lookups. + ASSERT_GE(secondary_cache->num_inserts(), 1u); + ASSERT_GE(secondary_cache->num_lookups(), 1u); + + Destroy(options); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. +TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Note that, block_1 is never successfully + // inserted to the block cache. 
Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + // Fail the insertion, in LRU cache, the secondary insertion returned status + // is not checked, therefore, the DB will not be influenced. + secondary_cache->InjectFailure(); + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + secondary_cache->ResetInjectFailure(); + + Destroy(options); +} + +TEST_F(LRUSecondaryCacheTest, BasicWaitAllTest) { + LRUCacheOptions opts(1024, 2, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(32 * 1024); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + const int num_keys = 32; + + Random rnd(301); + std::vector values; + for (int i = 0; i < num_keys; ++i) { + std::string str = rnd.RandomString(1020); + values.emplace_back(str); + TestItem* item = new TestItem(str.data(), str.length()); + ASSERT_OK(cache->Insert("k" + std::to_string(i), item, + &LRUSecondaryCacheTest::helper_, str.length())); + } + // Force all entries to be evicted to the secondary cache + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 32u); + cache->SetCapacity(32 * 1024); + + secondary_cache->SetResultMap( + {{"k3", TestSecondaryCache::ResultType::DEFER}, + {"k4", TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {"k5", TestSecondaryCache::ResultType::FAIL}}); + std::vector results; + for (int i = 0; i < 6; ++i) { + results.emplace_back( + cache->Lookup("k" + std::to_string(i), &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false)); + } + cache->WaitAll(results); + for (int i = 0; i < 6; ++i) { + if (i == 4) { + ASSERT_EQ(cache->Value(results[i]), nullptr); + } else if (i == 5) { + ASSERT_EQ(results[i], nullptr); + continue; + } else { + TestItem* item = static_cast(cache->Value(results[i])); + ASSERT_EQ(item->ToString(), values[i]); + } + cache->Release(results[i]); + } + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, we have one KV pair per data block. We indirectly determine +// the cache key associated with each data block (and thus each KV) by using +// a sync point callback in TestSecondaryCache::Lookup. We then control the +// lookup result by setting the ResultMap. +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) { + LRUCacheOptions opts(1 << 20, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + DestroyAndReopen(options); + Random rnd(301); + const int N = 8; + std::vector keys; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(4000); + keys.emplace_back(p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB does the paranoid check for the new + // SST file. This will try to lookup all data blocks in the secondary + // cache. 
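
Editor's note: BasicWaitAllTest above exercises the deferred-lookup flow end to end: lookups issued with wait=false may return pending handles, WaitAll() resolves them in one pass, and the caller then distinguishes handles whose value arrived from those whose secondary lookup ultimately failed. A toy model of just that control flow (PendingHandle and WaitAll below are illustrations, not the RocksDB types):

#include <cassert>
#include <string>
#include <vector>

struct PendingHandle {
  bool ready = false;
  bool will_fail = false;   // models a DEFER_AND_FAIL result
  std::string value;

  void Complete() {
    ready = true;
    if (!will_fail) {
      value = "payload";
    }
  }
  const std::string* Value() const {
    return (ready && !will_fail) ? &value : nullptr;
  }
};

// Resolve every still-pending handle in a single pass, like Cache::WaitAll().
void WaitAll(std::vector<PendingHandle*>& handles) {
  for (PendingHandle* h : handles) {
    if (h != nullptr && !h->ready) {
      h->Complete();
    }
  }
}

int main() {
  PendingHandle deferred;             // DEFER: resolves with a value
  PendingHandle deferred_and_failed;  // DEFER_AND_FAIL: resolves without one
  deferred_and_failed.will_fail = true;

  // A nullptr entry models FAIL, where Lookup returned no handle at all.
  std::vector<PendingHandle*> handles{&deferred, &deferred_and_failed, nullptr};
  WaitAll(handles);

  assert(deferred.Value() != nullptr);             // usable after WaitAll
  assert(deferred_and_failed.Value() == nullptr);  // caller must check and release
  return 0;
}
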
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 8u); + + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 8u); + cache->SetCapacity(1 << 20); + + std::vector cache_keys; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TestSecondaryCache::Lookup", [&cache_keys](void* key) -> void { + cache_keys.emplace_back(*(static_cast(key))); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < N; ++i) { + std::string v = Get(Key(i)); + ASSERT_EQ(4000, v.size()); + ASSERT_EQ(v, keys[i]); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(secondary_cache->num_lookups(), 16u); + cache->SetCapacity(0); + cache->SetCapacity(1 << 20); + + ASSERT_EQ(Get(Key(2)), keys[2]); + ASSERT_EQ(Get(Key(7)), keys[7]); + secondary_cache->SetResultMap( + {{cache_keys[3], TestSecondaryCache::ResultType::DEFER}, + {cache_keys[4], TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {cache_keys[5], TestSecondaryCache::ResultType::FAIL}}); + + std::vector mget_keys( + {Key(0), Key(1), Key(2), Key(3), Key(4), Key(5), Key(6), Key(7)}); + std::vector values(mget_keys.size()); + std::vector s(keys.size()); + std::vector key_slices; + for (const std::string& key : mget_keys) { + key_slices.emplace_back(key); + } + uint32_t num_lookups = secondary_cache->num_lookups(); + dbfull()->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), + key_slices.size(), key_slices.data(), values.data(), + s.data(), false); + ASSERT_EQ(secondary_cache->num_lookups(), num_lookups + 5); + for (int i = 0; i < N; ++i) { + ASSERT_OK(s[i]); + ASSERT_EQ(values[i].ToString(), keys[i]); + values[i].Reset(); + } + Destroy(options); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index 6c915df8cc8..bf90ea3b10f 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -9,53 +9,96 @@ #include "cache/sharded_cache.h" -#include +#include +#include +#include +#include "util/hash.h" +#include "util/math.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +namespace { + +inline uint32_t HashSlice(const Slice& s) { + return Lower32of64(GetSliceNPHash64(s)); +} + +} // namespace + ShardedCache::ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, std::shared_ptr allocator) : Cache(std::move(allocator)), - num_shard_bits_(num_shard_bits), + shard_mask_((uint32_t{1} << num_shard_bits) - 1), capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), last_id_(1) {} void ShardedCache::SetCapacity(size_t capacity) { - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; MutexLock l(&capacity_mutex_); - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->SetCapacity(per_shard); } capacity_ = capacity; } void ShardedCache::SetStrictCapacityLimit(bool strict_capacity_limit) { - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); MutexLock l(&capacity_mutex_); - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->SetStrictCapacityLimit(strict_capacity_limit); } strict_capacity_limit_ = strict_capacity_limit; } Status ShardedCache::Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) { + DeleterFn 
deleter, Handle** handle, + Priority priority) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash)) ->Insert(key, hash, value, charge, deleter, handle, priority); } +Status ShardedCache::Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle, Priority priority) { + uint32_t hash = HashSlice(key); + if (!helper) { + return Status::InvalidArgument(); + } + return GetShard(Shard(hash)) + ->Insert(key, hash, value, helper, charge, handle, priority); +} + Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash))->Lookup(key, hash); } +Cache::Handle* ShardedCache::Lookup(const Slice& key, + const CacheItemHelper* helper, + const CreateCallback& create_cb, + Priority priority, bool wait, + Statistics* /*stats*/) { + uint32_t hash = HashSlice(key); + return GetShard(Shard(hash)) + ->Lookup(key, hash, helper, create_cb, priority, wait); +} + +bool ShardedCache::IsReady(Handle* handle) { + uint32_t hash = GetHash(handle); + return GetShard(Shard(hash))->IsReady(handle); +} + +void ShardedCache::Wait(Handle* handle) { + uint32_t hash = GetHash(handle); + GetShard(Shard(hash))->Wait(handle); +} + bool ShardedCache::Ref(Handle* handle) { uint32_t hash = GetHash(handle); return GetShard(Shard(hash))->Ref(handle); @@ -66,6 +109,11 @@ bool ShardedCache::Release(Handle* handle, bool force_erase) { return GetShard(Shard(hash))->Release(handle, force_erase); } +bool ShardedCache::Release(Handle* handle, bool useful, bool force_erase) { + uint32_t hash = GetHash(handle); + return GetShard(Shard(hash))->Release(handle, useful, force_erase); +} + void ShardedCache::Erase(const Slice& key) { uint32_t hash = HashSlice(key); GetShard(Shard(hash))->Erase(key, hash); @@ -87,9 +135,9 @@ bool ShardedCache::HasStrictCapacityLimit() const { size_t ShardedCache::GetUsage() const { // We will not lock the cache when getting the usage from shards. - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); size_t usage = 0; - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { usage += GetShard(s)->GetUsage(); } return usage; @@ -101,25 +149,42 @@ size_t ShardedCache::GetUsage(Handle* handle) const { size_t ShardedCache::GetPinnedUsage() const { // We will not lock the cache when getting the usage from shards. - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); size_t usage = 0; - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { usage += GetShard(s)->GetPinnedUsage(); } return usage; } -void ShardedCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - int num_shards = 1 << num_shard_bits_; - for (int s = 0; s < num_shards; s++) { - GetShard(s)->ApplyToAllCacheEntries(callback, thread_safe); - } +void ShardedCache::ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) { + uint32_t num_shards = GetNumShards(); + // Iterate over part of each shard, rotating between shards, to + // minimize impact on latency of concurrent operations. 
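The ShardedCache changes route every operation through Shard(hash), which now selects a shard by masking the low bits of the hash (see the Shard() definition later in this diff), while SetCapacity divides the total capacity across shards with a round-up. A small standalone sketch of that arithmetic with illustrative values; the rotation loop of ApplyToAllEntries continues immediately after it.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Illustrative values only; this mirrors the arithmetic, not the class.
int main() {
  const int num_shard_bits = 6;  // 64 shards
  const uint32_t shard_mask = (uint32_t{1} << num_shard_bits) - 1;
  const uint32_t num_shards = shard_mask + 1;

  // Shard selection now masks the low bits of the hash instead of shifting
  // the high bits down.
  const uint32_t hash = 0xDEADBEEFu;
  const uint32_t shard = hash & shard_mask;

  // Total capacity is split across shards, rounded up.
  const size_t capacity = 1000;
  const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;

  std::printf("shard=%u per_shard=%zu\n", shard, per_shard);  // shard=47 per_shard=16
  return 0;
}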
+ std::unique_ptr states(new uint32_t[num_shards]{}); + + uint32_t aepl_in_32 = static_cast( + std::min(size_t{UINT32_MAX}, opts.average_entries_per_lock)); + aepl_in_32 = std::min(aepl_in_32, uint32_t{1}); + + bool remaining_work; + do { + remaining_work = false; + for (uint32_t s = 0; s < num_shards; s++) { + if (states[s] != UINT32_MAX) { + GetShard(s)->ApplyToSomeEntries(callback, aepl_in_32, &states[s]); + remaining_work |= states[s] != UINT32_MAX; + } + } + } while (remaining_work); } void ShardedCache::EraseUnRefEntries() { - int num_shards = 1 << num_shard_bits_; - for (int s = 0; s < num_shards; s++) { + uint32_t num_shards = GetNumShards(); + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->EraseUnRefEntries(); } } @@ -134,7 +199,8 @@ std::string ShardedCache::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " capacity : %" ROCKSDB_PRIszt "\n", capacity_); ret.append(buffer); - snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", num_shard_bits_); + snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", + GetNumShardBits()); ret.append(buffer); snprintf(buffer, kBufferSize, " strict_capacity_limit : %d\n", strict_capacity_limit_); @@ -159,4 +225,8 @@ int GetDefaultCacheShardBits(size_t capacity) { return num_shard_bits; } +int ShardedCache::GetNumShardBits() const { return BitsSetToOne(shard_mask_); } + +uint32_t ShardedCache::GetNumShards() const { return shard_mask_ + 1; } + } // namespace ROCKSDB_NAMESPACE diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index ce9e459dc14..3e2a20abac4 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -14,7 +14,6 @@ #include "port/port.h" #include "rocksdb/cache.h" -#include "util/hash.h" namespace ROCKSDB_NAMESPACE { @@ -24,20 +23,37 @@ class CacheShard { CacheShard() = default; virtual ~CacheShard() = default; + using DeleterFn = Cache::DeleterFn; virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), + size_t charge, DeleterFn deleter, + Cache::Handle** handle, Cache::Priority priority) = 0; + virtual Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, Cache::Handle** handle, Cache::Priority priority) = 0; virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) = 0; + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait) = 0; + virtual bool Release(Cache::Handle* handle, bool useful, + bool force_erase) = 0; + virtual bool IsReady(Cache::Handle* handle) = 0; + virtual void Wait(Cache::Handle* handle) = 0; virtual bool Ref(Cache::Handle* handle) = 0; - virtual bool Release(Cache::Handle* handle, bool force_erase = false) = 0; + virtual bool Release(Cache::Handle* handle, bool force_erase) = 0; virtual void Erase(const Slice& key, uint32_t hash) = 0; virtual void SetCapacity(size_t capacity) = 0; virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; virtual size_t GetUsage() const = 0; virtual size_t GetPinnedUsage() const = 0; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) = 0; + // Handles iterating over roughly `average_entries_per_lock` entries, using + // `state` to somehow record where it last ended up. Caller initially uses + // *state == 0 and implementation sets *state = UINT32_MAX to indicate + // completion. 
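The ApplyToAllEntries body above rotates among shards, advancing a per-shard cursor until every cursor reaches UINT32_MAX; the ApplyToSomeEntries declaration that the header comment above documents follows right after this sketch. Below is a simplified, self-contained model of that rotation, in which plain vectors stand in for cache shards and no locking is shown.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <vector>

// Toy model (not RocksDB code): each "shard" is a vector of ints, and a state
// of UINT32_MAX marks a shard as finished, as described in the comment above.
void ApplyToSome(const std::vector<int>& shard, uint32_t limit, uint32_t* state,
                 const std::function<void(int)>& cb) {
  uint32_t i = *state;
  const uint32_t end = i + limit;
  for (; i < end && i < shard.size(); ++i) {
    cb(shard[i]);
  }
  *state = (i >= shard.size()) ? UINT32_MAX : i;
}

void ApplyToAll(const std::vector<std::vector<int>>& shards, uint32_t per_pass,
                const std::function<void(int)>& cb) {
  // One cursor per shard, all starting at zero.
  std::unique_ptr<uint32_t[]> states(new uint32_t[shards.size()]{});
  bool remaining_work;
  do {
    remaining_work = false;
    for (size_t s = 0; s < shards.size(); ++s) {
      if (states[s] != UINT32_MAX) {
        ApplyToSome(shards[s], per_pass, &states[s], cb);
        remaining_work |= states[s] != UINT32_MAX;
      }
    }
  } while (remaining_work);
}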
+ virtual void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } void set_metadata_charge_policy( @@ -57,22 +73,29 @@ class ShardedCache : public Cache { ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, std::shared_ptr memory_allocator = nullptr); virtual ~ShardedCache() = default; - virtual const char* Name() const override = 0; - virtual CacheShard* GetShard(int shard) = 0; - virtual const CacheShard* GetShard(int shard) const = 0; - virtual void* Value(Handle* handle) override = 0; - virtual size_t GetCharge(Handle* handle) const override = 0; + virtual CacheShard* GetShard(uint32_t shard) = 0; + virtual const CacheShard* GetShard(uint32_t shard) const = 0; virtual uint32_t GetHash(Handle* handle) const = 0; - virtual void DisownData() override = 0; virtual void SetCapacity(size_t capacity) override; virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) override; + DeleterFn deleter, Handle** handle, + Priority priority) override; + virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t chargge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) override; virtual Handle* Lookup(const Slice& key, Statistics* stats) override; + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, + bool wait, Statistics* stats = nullptr) override; + virtual bool Release(Handle* handle, bool useful, + bool force_erase = false) override; + virtual bool IsReady(Handle* handle) override; + virtual void Wait(Handle* handle) override; virtual bool Ref(Handle* handle) override; virtual bool Release(Handle* handle, bool force_erase = false) override; virtual void Erase(const Slice& key) override; @@ -82,24 +105,21 @@ class ShardedCache : public Cache { virtual size_t GetUsage() const override; virtual size_t GetUsage(Handle* handle) const override; virtual size_t GetPinnedUsage() const override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override; virtual void EraseUnRefEntries() override; virtual std::string GetPrintableOptions() const override; - int GetNumShardBits() const { return num_shard_bits_; } - - private: - static inline uint32_t HashSlice(const Slice& s) { - return static_cast(GetSliceNPHash64(s)); - } + int GetNumShardBits() const; + uint32_t GetNumShards() const; - uint32_t Shard(uint32_t hash) { - // Note, hash >> 32 yields hash in gcc, not the zero we expect! - return (num_shard_bits_ > 0) ? 
(hash >> (32 - num_shard_bits_)) : 0; - } + protected: + inline uint32_t Shard(uint32_t hash) { return hash & shard_mask_; } - int num_shard_bits_; + private: + const uint32_t shard_mask_; mutable port::Mutex capacity_mutex_; size_t capacity_; bool strict_capacity_limit_; diff --git a/cloud/aws/aws_kafka.cc b/cloud/aws/aws_kafka.cc index 74e4a1b5f6f..3c4a9b31e4e 100644 --- a/cloud/aws/aws_kafka.cc +++ b/cloud/aws/aws_kafka.cc @@ -41,6 +41,7 @@ class KafkaWritableFile : public CloudLogWritableFile { } ~KafkaWritableFile() {} + using CloudLogWritableFile::Append; virtual Status Append(const Slice& data); virtual Status Close(); virtual bool IsSyncThreadSafe() const; diff --git a/cloud/aws/aws_kinesis.cc b/cloud/aws/aws_kinesis.cc index 30c93531917..33a724a2666 100644 --- a/cloud/aws/aws_kinesis.cc +++ b/cloud/aws/aws_kinesis.cc @@ -55,6 +55,7 @@ class KinesisWritableFile : public CloudLogWritableFile { } virtual ~KinesisWritableFile() {} + using CloudLogWritableFile::Append; virtual Status Append(const Slice& data) override; virtual Status Close() override; virtual Status LogDelete() override; diff --git a/cloud/cloud_env.cc b/cloud/cloud_env.cc index 594d4c6e18d..633c8728fab 100644 --- a/cloud/cloud_env.cc +++ b/cloud/cloud_env.cc @@ -122,92 +122,92 @@ static std::unordered_map {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); bucket->SetObjectPath(value); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - auto bucket = reinterpret_cast(addr); + const void* addr, std::string* value) { + auto bucket = static_cast(addr); *value = bucket->GetObjectPath(); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr1, const char* addr2, std::string* /*mismatch*/) { - auto bucket1 = reinterpret_cast(addr1); - auto bucket2 = reinterpret_cast(addr2); + const void* addr1, const void* addr2, std::string* /*mismatch*/) { + auto bucket1 = static_cast(addr1); + auto bucket2 = static_cast(addr2); return bucket1->GetObjectPath() == bucket2->GetObjectPath(); }}}, {"region", {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); bucket->SetRegion(value); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - auto bucket = reinterpret_cast(addr); + const void* addr, std::string* value) { + auto bucket = static_cast(addr); *value = bucket->GetRegion(); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr1, const char* addr2, std::string* /*mismatch*/) { - auto bucket1 = reinterpret_cast(addr1); - auto bucket2 = reinterpret_cast(addr2); + const void* addr1, const void* addr2, std::string* /*mismatch*/) { + auto bucket1 = static_cast(addr1); + auto bucket2 = static_cast(addr2); return bucket1->GetRegion() == bucket2->GetRegion(); }}}, {"prefix", {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions& 
/*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); bucket->SetBucketName(bucket->GetBucketName(false), value); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - auto bucket = reinterpret_cast(addr); + const void* addr, std::string* value) { + auto bucket = static_cast(addr); *value = bucket->GetBucketPrefix(); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr1, const char* addr2, std::string* /*mismatch*/) { - auto bucket1 = reinterpret_cast(addr1); - auto bucket2 = reinterpret_cast(addr2); + const void* addr1, const void* addr2, std::string* /*mismatch*/) { + auto bucket1 = static_cast(addr1); + auto bucket2 = static_cast(addr2); return bucket1->GetBucketPrefix() == bucket2->GetBucketPrefix(); }}}, {"bucket", {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); bucket->SetBucketName(value); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - auto bucket = reinterpret_cast(addr); + const void* addr, std::string* value) { + auto bucket = static_cast(addr); *value = bucket->GetBucketName(false); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr1, const char* addr2, std::string* /*mismatch*/) { - auto bucket1 = reinterpret_cast(addr1); - auto bucket2 = reinterpret_cast(addr2); + const void* addr1, const void* addr2, std::string* /*mismatch*/) { + auto bucket1 = static_cast(addr1); + auto bucket2 = static_cast(addr2); return bucket1->GetBucketName(false) == bucket2->GetBucketName(false); }}}, {"TEST", {0, OptionType::kUnknown, OptionVerificationType::kAlias, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); std::string name = value; std::string path; std::string region; @@ -255,9 +255,6 @@ static std::unordered_map {"skip_cloud_children_files", {offset_of(&CloudEnvOptions::skip_cloud_files_in_getchildren), OptionType::kBoolean}}, - {"use_direct_io_for_cloud_download", - {offset_of(&CloudEnvOptions::use_direct_io_for_cloud_download), - OptionType::kBoolean}}, {"constant_sst_file_size_in_manager", {offset_of( &CloudEnvOptions::constant_sst_file_size_in_sst_file_manager), @@ -274,9 +271,9 @@ static std::unordered_map (OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose | OptionTypeFlags::kCompareNever | OptionTypeFlags::kAllowNull), [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& value, char* addr) { + const std::string& value, void* addr) { auto provider = - reinterpret_cast*>(addr); + static_cast*>(addr); return CloudStorageProvider::CreateFromString(opts, value, provider); }}}, @@ -287,9 +284,9 @@ static std::unordered_map OptionTypeFlags::kCompareNever | OptionTypeFlags::kAllowNull), // Creates a new TableFactory based on value [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& 
value, char* addr) { + const std::string& value, void* addr) { auto controller = - reinterpret_cast*>(addr); + static_cast*>(addr); Status s = CloudLogController::CreateFromString(opts, value, controller); return s; @@ -306,8 +303,8 @@ static std::unordered_map {0, OptionType::kUnknown, OptionVerificationType::kAlias, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto copts = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto copts = static_cast(addr); std::string name; std::string path; std::string region; @@ -359,8 +356,7 @@ Status CloudEnvOptions::Serialize(const ConfigOptions& config_options, std::stri CloudEnv::CloudEnv(const CloudEnvOptions& options, Env* base, const std::shared_ptr& logger) : cloud_env_options(options), base_env_(base), info_log_(logger) { - ConfigurableHelper::RegisterOptions(*this, &cloud_env_options, - &cloud_env_option_type_info); + RegisterOptions(&cloud_env_options, &cloud_env_option_type_info); } CloudEnv::~CloudEnv() { diff --git a/cloud/cloud_env_impl.cc b/cloud/cloud_env_impl.cc index dc47c6f3f83..8efdd4b3d8a 100644 --- a/cloud/cloud_env_impl.cc +++ b/cloud/cloud_env_impl.cc @@ -859,17 +859,15 @@ Status CloudEnvImpl::LoadLocalCloudManifest(const std::string& dbname) { Status CloudEnvImpl::LoadLocalCloudManifest( const std::string& dbname, Env* base_env, std::unique_ptr* cloud_manifest) { - std::unique_ptr file; + std::unique_ptr reader; auto cloud_manifest_file_name = CloudManifestFile(dbname); - auto s = base_env->NewSequentialFile(cloud_manifest_file_name, &file, - EnvOptions()); + auto s = SequentialFileReader::Create(base_env->GetFileSystem(), + cloud_manifest_file_name, FileOptions(), + &reader, nullptr); if (!s.ok()) { return s; } - return CloudManifest::LoadFromLog( - std::unique_ptr(new SequentialFileReader( - NewLegacySequentialFileWrapper(file), cloud_manifest_file_name)), - cloud_manifest); + return CloudManifest::LoadFromLog(std::move(reader), cloud_manifest); } std::string CloudEnvImpl::RemapFilename(const std::string& logical_path) const { @@ -1005,12 +1003,11 @@ Status CloudEnvImpl::writeCloudManifest(CloudManifest* manifest, // Write to tmp file and atomically rename later. This helps if we crash // mid-write :) auto tmp_fname = fname + ".tmp"; - std::unique_ptr file; - Status s = local_env->NewWritableFile(tmp_fname, &file, EnvOptions()); + std::unique_ptr writer; + Status s = WritableFileWriter::Create(local_env->GetFileSystem(), tmp_fname, + FileOptions(), &writer, nullptr); if (s.ok()) { - s = manifest->WriteToLog(std::unique_ptr( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - tmp_fname, EnvOptions()))); + s = manifest->WriteToLog(std::move(writer)); } if (s.ok()) { s = local_env->RenameFile(tmp_fname, fname); @@ -1788,8 +1785,8 @@ Status CloudEnvImpl::RollNewEpoch(const std::string& local_dbname) { // However, we don't move here, we copy. If we moved and crashed immediately // after (before writing CLOUDMANIFEST), we'd corrupt our database. The old // MANIFEST file will be cleaned up in DeleteInvisibleFiles(). 
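The manifest I/O above moves from hand-wrapped Env files to the Create factory helpers on WritableFileWriter and SequentialFileReader. A hedged sketch of that pattern in isolation: the Create calls match the ones used in this diff, but the wrapper function, file name, and the availability of the internal file/ headers are assumptions for illustration only.

#include <memory>
#include <string>

#include "file/sequential_file_reader.h"
#include "file/writable_file_writer.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"

using namespace ROCKSDB_NAMESPACE;

// Hypothetical helper (not in the patch): shows the factory calls used above.
Status OpenManifestLikeFiles(Env* env, const std::string& fname) {
  std::unique_ptr<WritableFileWriter> writer;
  Status s = WritableFileWriter::Create(env->GetFileSystem(), fname,
                                        FileOptions(), &writer, nullptr);
  if (!s.ok()) {
    return s;
  }
  // ... write the log/manifest contents through `writer` ...

  std::unique_ptr<SequentialFileReader> reader;
  s = SequentialFileReader::Create(env->GetFileSystem(), fname, FileOptions(),
                                   &reader, nullptr);
  // ... replay the log/manifest through `reader` ...
  return s;
}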
- LegacyFileSystemWrapper fs(GetBaseEnv()); - st = CopyFile(&fs, ManifestFileWithEpoch(local_dbname, oldEpoch), + const auto& fs = GetBaseEnv()->GetFileSystem(); + st = CopyFile(fs.get(), ManifestFileWithEpoch(local_dbname, oldEpoch), ManifestFileWithEpoch(local_dbname, newEpoch), 0, true); if (!st.ok()) { return st; diff --git a/cloud/cloud_manifest_test.cc b/cloud/cloud_manifest_test.cc index 177064371c4..b38936a0f2f 100644 --- a/cloud/cloud_manifest_test.cc +++ b/cloud/cloud_manifest_test.cc @@ -50,22 +50,18 @@ TEST_F(CloudManifestTest, BasicTest) { // serialize and deserialize auto tmpfile = tmp_dir_ + "/cloudmanifest"; { - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(tmpfile, &file, EnvOptions())); - ASSERT_OK(manifest->WriteToLog( - std::unique_ptr(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), tmpfile, - EnvOptions())))); + std::unique_ptr writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), tmpfile, + FileOptions(), &writer, nullptr)); + ASSERT_OK(manifest->WriteToLog(std::move(writer))); } manifest.reset(); { - std::unique_ptr file; - ASSERT_OK(env_->NewSequentialFile(tmpfile, &file, EnvOptions())); - CloudManifest::LoadFromLog( - std::unique_ptr(new SequentialFileReader( - NewLegacySequentialFileWrapper(file), tmpfile)), - &manifest); + std::unique_ptr reader; + ASSERT_OK(SequentialFileReader::Create( + env_->GetFileSystem(), tmpfile, FileOptions(), &reader, nullptr)); + ASSERT_OK(CloudManifest::LoadFromLog(std::move(reader), &manifest)); } } } diff --git a/cloud/cloud_storage_provider_impl.h b/cloud/cloud_storage_provider_impl.h index 60f0974ca48..63c8587d6f2 100644 --- a/cloud/cloud_storage_provider_impl.h +++ b/cloud/cloud_storage_provider_impl.h @@ -51,12 +51,14 @@ class CloudStorageWritableFileImpl : public CloudStorageWritableFile { const EnvOptions& options); virtual ~CloudStorageWritableFileImpl(); + using CloudStorageWritableFile::Append; virtual Status Append(const Slice& data) override { assert(status_.ok()); // write to temporary file return local_file_->Append(data); } + using CloudStorageWritableFile::PositionedAppend; Status PositionedAppend(const Slice& data, uint64_t offset) override { return local_file_->PositionedAppend(data, offset); } diff --git a/cloud/db_cloud_impl.cc b/cloud/db_cloud_impl.cc index 6fd7afef933..c91938a16d8 100644 --- a/cloud/db_cloud_impl.cc +++ b/cloud/db_cloud_impl.cc @@ -29,21 +29,22 @@ namespace { */ class ConstantSizeSstFileManager : public SstFileManagerImpl { public: - ConstantSizeSstFileManager(int64_t constant_file_size, Env* env, + ConstantSizeSstFileManager(int64_t constant_file_size, + const std::shared_ptr& clock, + const std::shared_ptr& fs, std::shared_ptr logger, int64_t rate_bytes_per_sec, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) - : SstFileManagerImpl(env, std::make_shared(env), - std::move(logger), rate_bytes_per_sec, + : SstFileManagerImpl(clock, fs, std::move(logger), rate_bytes_per_sec, max_trash_db_ratio, bytes_max_delete_chunk), constant_file_size_(constant_file_size) { assert(constant_file_size_ >= 0); } - Status OnAddFile(const std::string& file_path, bool compaction) override { + Status OnAddFile(const std::string& file_path) override { return SstFileManagerImpl::OnAddFile( - file_path, uint64_t(constant_file_size_), compaction); + file_path, uint64_t(constant_file_size_)); } private: @@ -107,7 +108,8 @@ Status DBCloud::Open(const Options& opt, const std::string& local_dbname, // If users don't use Options.sst_file_manager, then these values 
are used // currently when creating an SST File Manager. options.sst_file_manager = std::make_shared( - constant_sst_file_size, options.env, options.info_log, + constant_sst_file_size, options.env->GetSystemClock(), + options.env->GetFileSystem(), options.info_log, 0 /* rate_bytes_per_sec */, 0.25 /* max_trash_db_ratio */, 64 * 1024 * 1024 /* bytes_max_delete_chunk */); } @@ -318,9 +320,9 @@ Status DBCloudImpl::DoCheckpointToCloud( auto current_epoch = cenv->GetCloudManifest()->GetCurrentEpoch().ToString(); auto manifest_fname = ManifestFileWithEpoch("", current_epoch); auto tmp_manifest_fname = manifest_fname + ".tmp"; - LegacyFileSystemWrapper fs(base_env); + auto fs = base_env->GetFileSystem(); st = - CopyFile(&fs, GetName() + "/" + manifest_fname, + CopyFile(fs.get(), GetName() + "/" + manifest_fname, GetName() + "/" + tmp_manifest_fname, manifest_file_size, false); if (!st.ok()) { return st; diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc index a1a1be3a264..520588afe89 100644 --- a/db/arena_wrapped_db_iter.cc +++ b/db/arena_wrapped_db_iter.cc @@ -30,20 +30,18 @@ Status ArenaWrappedDBIter::GetProperty(std::string prop_name, return db_iter_->GetProperty(prop_name, prop); } -void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob, - bool allow_refresh) { +void ArenaWrappedDBIter::Init( + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, - cf_options.user_comparator, nullptr, sequence, - true, max_sequential_skip_in_iteration, - read_callback, db_impl, cfd, allow_blob); + db_iter_ = + new (mem) DBIter(env, read_options, ioptions, mutable_cf_options, + ioptions.user_comparator, /* iter */ nullptr, version, + sequence, true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, expose_blob_index); sv_number_ = version_number; read_options_ = read_options; allow_refresh_ = allow_refresh; @@ -72,8 +70,9 @@ Status ArenaWrappedDBIter::Refresh() { read_callback_->Refresh(latest_seq); } Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, - latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, + sv->current, latest_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_, allow_refresh_); InternalIterator* internal_iter = db_impl_->NewInternalIterator( @@ -88,18 +87,17 @@ Status ArenaWrappedDBIter::Refresh() { } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - 
bool allow_blob, bool allow_refresh) { + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, + iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence, max_sequential_skip_in_iterations, version_number, read_callback, - db_impl, cfd, allow_blob, allow_refresh); + db_impl, cfd, expose_blob_index, allow_refresh); if (db_impl != nullptr && cfd != nullptr && allow_refresh) { - iter->StoreRefreshInfo(db_impl, cfd, read_callback, allow_blob); + iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index); } return iter; diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h index 80422f63a02..17273b201d9 100644 --- a/db/arena_wrapped_db_iter.h +++ b/db/arena_wrapped_db_iter.h @@ -23,6 +23,7 @@ namespace ROCKSDB_NAMESPACE { class Arena; +class Version; // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed to be allocated. This class is used as an entry point of @@ -71,21 +72,21 @@ class ArenaWrappedDBIter : public Iterator { Status Refresh() override; void Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh); + bool expose_blob_index, bool allow_refresh); // Store some parameters so we can refresh the iterator at a later point // with these same params void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd, - ReadCallback* read_callback, bool allow_blob) { + ReadCallback* read_callback, bool expose_blob_index) { db_impl_ = db_impl; cfd_ = cfd; read_callback_ = read_callback; - allow_blob_ = allow_blob; + expose_blob_index_ = expose_blob_index; } private: @@ -96,7 +97,7 @@ class ArenaWrappedDBIter : public Iterator { DBImpl* db_impl_ = nullptr; ReadOptions read_options_; ReadCallback* read_callback_; - bool allow_blob_ = false; + bool expose_blob_index_ = false; bool allow_refresh_ = true; }; @@ -104,11 +105,10 @@ class ArenaWrappedDBIter : public Iterator { // `db_impl` and `cfd` are used for reneweal. If left null, renewal will not // be supported. 
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false, - bool allow_refresh = true); + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool expose_blob_index = false, bool allow_refresh = true); } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_fetcher.cc b/db/blob/blob_fetcher.cc new file mode 100644 index 00000000000..a42a4be5f39 --- /dev/null +++ b/db/blob/blob_fetcher.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_fetcher.h" + +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFetcher::FetchBlob(const Slice& user_key, const Slice& blob_index, + PinnableSlice* blob_value) { + Status s; + assert(version_); + constexpr uint64_t* bytes_read = nullptr; + s = version_->GetBlob(read_options_, user_key, blob_index, blob_value, + bytes_read); + return s; +} + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/blob/blob_fetcher.h b/db/blob/blob_fetcher.h new file mode 100644 index 00000000000..747057f0999 --- /dev/null +++ b/db/blob/blob_fetcher.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Version; + +class BlobFetcher { + public: + BlobFetcher(Version* version, const ReadOptions& read_options) + : version_(version), read_options_(read_options) {} + + Status FetchBlob(const Slice& user_key, const Slice& blob_index, + PinnableSlice* blob_value); + + private: + Version* version_; + ReadOptions read_options_; +}; +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc index 57f05438c47..674466c7128 100644 --- a/db/blob/blob_file_builder.cc +++ b/db/blob/blob_file_builder.cc @@ -8,6 +8,7 @@ #include #include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_completion_callback.h" #include "db/blob/blob_index.h" #include "db/blob/blob_log_format.h" #include "db/blob/blob_log_writer.h" @@ -17,41 +18,46 @@ #include "file/writable_file_writer.h" #include "logging/logging.h" #include "options/cf_options.h" +#include "options/options_helper.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" #include "util/compression.h" namespace ROCKSDB_NAMESPACE { BlobFileBuilder::BlobFileBuilder( - VersionSet* versions, Env* env, FileSystem* fs, - const ImmutableCFOptions* immutable_cf_options, + VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, int job_id, uint32_t column_family_id, const std::string& column_family_name, Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, std::vector* blob_file_paths, std::vector* blob_file_additions) - : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, env, - fs, immutable_cf_options, mutable_cf_options, - file_options, job_id, column_family_id, - column_family_name, io_priority, write_hint, - blob_file_paths, blob_file_additions) {} + : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs, + immutable_options, mutable_cf_options, file_options, + job_id, column_family_id, column_family_name, io_priority, + write_hint, io_tracer, blob_callback, blob_file_paths, + blob_file_additions) {} BlobFileBuilder::BlobFileBuilder( - std::function file_number_generator, Env* env, FileSystem* fs, - const ImmutableCFOptions* immutable_cf_options, + std::function file_number_generator, FileSystem* fs, + const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, int job_id, uint32_t column_family_id, const std::string& column_family_name, Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, std::vector* blob_file_paths, std::vector* blob_file_additions) : file_number_generator_(std::move(file_number_generator)), - env_(env), fs_(fs), - immutable_cf_options_(immutable_cf_options), + immutable_options_(immutable_options), min_blob_size_(mutable_cf_options->min_blob_size), blob_file_size_(mutable_cf_options->blob_file_size), blob_compression_type_(mutable_cf_options->blob_compression_type), @@ -61,14 +67,15 @@ BlobFileBuilder::BlobFileBuilder( column_family_name_(column_family_name), io_priority_(io_priority), write_hint_(write_hint), + io_tracer_(io_tracer), + blob_callback_(blob_callback), 
blob_file_paths_(blob_file_paths), blob_file_additions_(blob_file_additions), blob_count_(0), blob_bytes_(0) { assert(file_number_generator_); - assert(env_); assert(fs_); - assert(immutable_cf_options_); + assert(immutable_options_); assert(file_options_); assert(blob_file_paths_); assert(blob_file_paths_->empty()); @@ -149,19 +156,20 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { assert(file_number_generator_); const uint64_t blob_file_number = file_number_generator_(); - assert(immutable_cf_options_); - assert(!immutable_cf_options_->cf_paths.empty()); - std::string blob_file_path = BlobFileName( - immutable_cf_options_->cf_paths.front().path, blob_file_number); + assert(immutable_options_); + assert(!immutable_options_->cf_paths.empty()); + std::string blob_file_path = + BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number); std::unique_ptr file; { - TEST_SYNC_POINT("BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile"); - assert(file_options_); - const Status s = - NewWritableFile(fs_, blob_file_path, &file, *file_options_); + Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s); + if (!s.ok()) { return s; } @@ -176,17 +184,20 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { assert(file); file->SetIOPriority(io_priority_); file->SetWriteLifeTimeHint(write_hint_); - - Statistics* const statistics = immutable_cf_options_->statistics; - + FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types; + Statistics* const statistics = immutable_options_->stats; std::unique_ptr file_writer(new WritableFileWriter( - std::move(file), blob_file_paths_->back(), *file_options_, env_, - nullptr /*IOTracer*/, statistics, immutable_cf_options_->listeners, - immutable_cf_options_->file_checksum_gen_factory)); + std::move(file), blob_file_paths_->back(), *file_options_, + immutable_options_->clock, io_tracer_, statistics, + immutable_options_->listeners, + immutable_options_->file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kBlobFile))); - std::unique_ptr blob_log_writer( - new BlobLogWriter(std::move(file_writer), env_, statistics, - blob_file_number, immutable_cf_options_->use_fsync)); + constexpr bool do_flush = false; + + std::unique_ptr blob_log_writer(new BlobLogWriter( + std::move(file_writer), immutable_options_->clock, statistics, + blob_file_number, immutable_options_->use_fsync, do_flush)); constexpr bool has_ttl = false; constexpr ExpirationRange expiration_range; @@ -195,9 +206,11 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { expiration_range); { - TEST_SYNC_POINT("BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader"); + Status s = blob_log_writer->WriteHeader(header); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s); - const Status s = blob_log_writer->WriteHeader(header); if (!s.ok()) { return s; } @@ -247,9 +260,10 @@ Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob, uint64_t key_offset = 0; - TEST_SYNC_POINT("BlobFileBuilder::WriteBlobToFile:AddRecord"); + Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s); - const Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); if (!s.ok()) { return s; } @@ -271,10 +285,10 @@ Status BlobFileBuilder::CloseBlobFile() { std::string checksum_method; std::string checksum_value; - 
TEST_SYNC_POINT("BlobFileBuilder::WriteBlobToFile:AppendFooter"); + Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s); - const Status s = - writer_->AppendFooter(footer, &checksum_method, &checksum_value); if (!s.ok()) { return s; } @@ -286,18 +300,21 @@ Status BlobFileBuilder::CloseBlobFile() { std::move(checksum_method), std::move(checksum_value)); - assert(immutable_cf_options_); - ROCKS_LOG_INFO(immutable_cf_options_->info_log, + assert(immutable_options_); + ROCKS_LOG_INFO(immutable_options_->logger, "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64 " total blobs, %" PRIu64 " total bytes", column_family_name_.c_str(), job_id_, blob_file_number, blob_count_, blob_bytes_); + if (blob_callback_) { + s = blob_callback_->OnBlobFileCompleted(blob_file_paths_->back()); + } writer_.reset(); blob_count_ = 0; blob_bytes_ = 0; - return Status::OK(); + return s; } Status BlobFileBuilder::CloseBlobFileIfNeeded() { @@ -313,4 +330,20 @@ Status BlobFileBuilder::CloseBlobFileIfNeeded() { return CloseBlobFile(); } +void BlobFileBuilder::Abandon() { + if (!IsBlobFileOpen()) { + return; + } + + if (blob_callback_) { + // BlobFileBuilder::Abandon() is called because of error while writing to + // Blob files. So we can ignore the below error. + blob_callback_->OnBlobFileCompleted(blob_file_paths_->back()) + .PermitUncheckedError(); + } + + writer_.reset(); + blob_count_ = 0; + blob_bytes_ = 0; +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_builder.h b/db/blob/blob_file_builder.h index 755ab435094..0929b6a7758 100644 --- a/db/blob/blob_file_builder.h +++ b/db/blob/blob_file_builder.h @@ -18,36 +18,42 @@ namespace ROCKSDB_NAMESPACE { class VersionSet; class FileSystem; -struct ImmutableCFOptions; +class SystemClock; +struct ImmutableOptions; struct MutableCFOptions; struct FileOptions; class BlobFileAddition; class Status; class Slice; class BlobLogWriter; +class IOTracer; +class BlobFileCompletionCallback; class BlobFileBuilder { public: - BlobFileBuilder(VersionSet* versions, Env* env, FileSystem* fs, - const ImmutableCFOptions* immutable_cf_options, + BlobFileBuilder(VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, int job_id, uint32_t column_family_id, const std::string& column_family_name, Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, std::vector* blob_file_paths, std::vector* blob_file_additions); - BlobFileBuilder(std::function file_number_generator, Env* env, - FileSystem* fs, - const ImmutableCFOptions* immutable_cf_options, + BlobFileBuilder(std::function file_number_generator, + FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, int job_id, uint32_t column_family_id, const std::string& column_family_name, Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, std::vector* blob_file_paths, std::vector* blob_file_additions); @@ -58,6 +64,7 @@ class BlobFileBuilder { Status Add(const Slice& key, const Slice& value, std::string* blob_index); Status Finish(); + void Abandon(); private: bool IsBlobFileOpen() const; @@ -69,9 +76,8 @@ class BlobFileBuilder { Status CloseBlobFileIfNeeded(); 
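The new Abandon() above complements Add() and Finish() on BlobFileBuilder. Here is a call-sequence sketch of how a caller might drive the builder; the wrapper function is hypothetical, and construction is elided because it requires the VersionSet/FileSystem/options plumbing shown in the constructors above.

#include <string>
#include <utility>
#include <vector>

#include "db/blob/blob_file_builder.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"

using namespace ROCKSDB_NAMESPACE;

// Hypothetical wrapper (not in the patch): drives an already-constructed
// builder, abandoning the partially written blob file on any error.
Status WriteBlobs(BlobFileBuilder* builder,
                  const std::vector<std::pair<Slice, Slice>>& kvs) {
  for (const auto& kv : kvs) {
    std::string blob_index;  // filled in only if the value went to a blob file
    const Status s = builder->Add(kv.first, kv.second, &blob_index);
    if (!s.ok()) {
      builder->Abandon();  // drop the partially written blob file
      return s;
    }
  }
  // Finish() closes the current blob file (if any) and records its metadata.
  return builder->Finish();
}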
std::function file_number_generator_; - Env* env_; FileSystem* fs_; - const ImmutableCFOptions* immutable_cf_options_; + const ImmutableOptions* immutable_options_; uint64_t min_blob_size_; uint64_t blob_file_size_; CompressionType blob_compression_type_; @@ -81,6 +87,8 @@ class BlobFileBuilder { std::string column_family_name_; Env::IOPriority io_priority_; Env::WriteLifeTimeHint write_hint_; + std::shared_ptr io_tracer_; + BlobFileCompletionCallback* blob_callback_; std::vector* blob_file_paths_; std::vector* blob_file_additions_; std::unique_ptr writer_; diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc index 72e9ac47b98..08cfac00754 100644 --- a/db/blob/blob_file_builder_test.cc +++ b/db/blob/blob_file_builder_test.cc @@ -15,7 +15,6 @@ #include "db/blob/blob_index.h" #include "db/blob/blob_log_format.h" #include "db/blob/blob_log_sequential_reader.h" -#include "env/composite_env_wrapper.h" #include "env/mock_env.h" #include "file/filename.h" #include "file/random_access_file_reader.h" @@ -40,7 +39,10 @@ class TestFileNumberGenerator { class BlobFileBuilderTest : public testing::Test { protected: - BlobFileBuilderTest() : mock_env_(Env::Default()), fs_(&mock_env_) {} + BlobFileBuilderTest() : mock_env_(Env::Default()) { + fs_ = mock_env_.GetFileSystem().get(); + clock_ = mock_env_.GetSystemClock().get(); + } void VerifyBlobFile(uint64_t blob_file_number, const std::string& blob_file_path, @@ -54,14 +56,13 @@ class BlobFileBuilderTest : public testing::Test { std::unique_ptr file; constexpr IODebugContext* dbg = nullptr; ASSERT_OK( - fs_.NewRandomAccessFile(blob_file_path, file_options_, &file, dbg)); + fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg)); std::unique_ptr file_reader( - new RandomAccessFileReader(std::move(file), blob_file_path, - &mock_env_)); + new RandomAccessFileReader(std::move(file), blob_file_path, clock_)); constexpr Statistics* statistics = nullptr; - BlobLogSequentialReader blob_log_reader(std::move(file_reader), &mock_env_, + BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_, statistics); BlobLogHeader header; @@ -108,7 +109,8 @@ class BlobFileBuilderTest : public testing::Test { } MockEnv mock_env_; - LegacyFileSystemWrapper fs_; + FileSystem* fs_; + SystemClock* clock_; FileOptions file_options_; }; @@ -125,8 +127,9 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { "BlobFileBuilderTest_BuildAndCheckOneFile"), 0); options.enable_blob_files = true; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -138,11 +141,11 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); std::vector> expected_key_value_pairs( number_of_blobs); @@ -174,9 +177,9 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { const std::string& blob_file_path = 
blob_file_paths[0]; - ASSERT_EQ(blob_file_path, - BlobFileName(immutable_cf_options.cf_paths.front().path, - blob_file_number)); + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); ASSERT_EQ(blob_file_additions.size(), 1); @@ -208,8 +211,9 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { 0); options.enable_blob_files = true; options.blob_file_size = value_size; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -221,11 +225,11 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); std::vector> expected_key_value_pairs( number_of_blobs); @@ -258,7 +262,7 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { const uint64_t blob_file_number = i + 2; ASSERT_EQ(blob_file_paths[i], - BlobFileName(immutable_cf_options.cf_paths.front().path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); const auto& blob_file_addition = blob_file_additions[i]; @@ -293,8 +297,9 @@ TEST_F(BlobFileBuilderTest, InlinedValues) { 0); options.enable_blob_files = true; options.min_blob_size = 1024; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -306,11 +311,11 @@ TEST_F(BlobFileBuilderTest, InlinedValues) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); for (size_t i = 0; i < number_of_blobs; ++i) { const std::string key = std::to_string(i); @@ -345,8 +350,9 @@ TEST_F(BlobFileBuilderTest, Compression) { test::PerThreadDBPath(&mock_env_, "BlobFileBuilderTest_Compression"), 0); options.enable_blob_files = true; options.blob_compression_type = kSnappyCompression; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -358,11 +364,11 @@ TEST_F(BlobFileBuilderTest, Compression) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, 
io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); const std::string key("1"); const std::string uncompressed_value(value_size, 'x'); @@ -381,9 +387,9 @@ TEST_F(BlobFileBuilderTest, Compression) { const std::string& blob_file_path = blob_file_paths[0]; - ASSERT_EQ(blob_file_path, - BlobFileName(immutable_cf_options.cf_paths.front().path, - blob_file_number)); + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); ASSERT_EQ(blob_file_additions.size(), 1); @@ -427,8 +433,8 @@ TEST_F(BlobFileBuilderTest, CompressionError) { 0); options.enable_blob_files = true; options.blob_compression_type = kSnappyCompression; - - ImmutableCFOptions immutable_cf_options(options); + options.env = &mock_env_; + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -440,11 +446,11 @@ TEST_F(BlobFileBuilderTest, CompressionError) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue", [](void* arg) { @@ -466,9 +472,9 @@ TEST_F(BlobFileBuilderTest, CompressionError) { constexpr uint64_t blob_file_number = 2; ASSERT_EQ(blob_file_paths.size(), 1); - ASSERT_EQ(blob_file_paths[0], - BlobFileName(immutable_cf_options.cf_paths.front().path, - blob_file_number)); + ASSERT_EQ( + blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); ASSERT_TRUE(blob_file_additions.empty()); } @@ -504,8 +510,9 @@ TEST_F(BlobFileBuilderTest, Checksum) { options.enable_blob_files = true; options.file_checksum_gen_factory = std::make_shared(); + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -517,11 +524,11 @@ TEST_F(BlobFileBuilderTest, Checksum) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); const std::string key("1"); const std::string value("deadbeef"); @@ -540,9 +547,9 @@ TEST_F(BlobFileBuilderTest, Checksum) { const std::string& blob_file_path 
= blob_file_paths[0]; - ASSERT_EQ(blob_file_path, - BlobFileName(immutable_cf_options.cf_paths.front().path, - blob_file_number)); + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); ASSERT_EQ(blob_file_additions.size(), 1); @@ -570,13 +577,11 @@ class BlobFileBuilderIOErrorTest protected: BlobFileBuilderIOErrorTest() : mock_env_(Env::Default()), - fault_injection_env_(&mock_env_), - fs_(&fault_injection_env_), + fs_(mock_env_.GetFileSystem().get()), sync_point_(GetParam()) {} MockEnv mock_env_; - FaultInjectionTestEnv fault_injection_env_; - LegacyFileSystemWrapper fs_; + FileSystem* fs_; FileOptions file_options_; std::string sync_point_; }; @@ -597,13 +602,13 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { Options options; options.cf_paths.emplace_back( - test::PerThreadDBPath(&fault_injection_env_, - "BlobFileBuilderIOErrorTest_IOError"), + test::PerThreadDBPath(&mock_env_, "BlobFileBuilderIOErrorTest_IOError"), 0); options.enable_blob_files = true; options.blob_file_size = value_size; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -615,15 +620,17 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &fault_injection_env_, - &fs_, &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); - SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(false, - Status::IOError(sync_point_)); + (*s) = Status::IOError(sync_point_); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -644,7 +651,7 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { ASSERT_EQ(blob_file_paths.size(), 1); ASSERT_EQ(blob_file_paths[0], - BlobFileName(immutable_cf_options.cf_paths.front().path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); } diff --git a/db/blob/blob_file_cache.cc b/db/blob/blob_file_cache.cc new file mode 100644 index 00000000000..1a6cdf6880c --- /dev/null +++ b/db/blob/blob_file_cache.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_cache.h" + +#include +#include + +#include "db/blob/blob_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileCache::BlobFileCache(Cache* cache, + const ImmutableOptions* immutable_options, + const FileOptions* file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer) + : cache_(cache), + mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr), + immutable_options_(immutable_options), + file_options_(file_options), + column_family_id_(column_family_id), + blob_file_read_hist_(blob_file_read_hist), + io_tracer_(io_tracer) { + assert(cache_); + assert(immutable_options_); + assert(file_options_); +} + +Status BlobFileCache::GetBlobFileReader( + uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader) { + assert(blob_file_reader); + assert(blob_file_reader->IsEmpty()); + + const Slice key = GetSlice(&blob_file_number); + + assert(cache_); + + Cache::Handle* handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck"); + + // Check again while holding mutex + MutexLock lock(mutex_.get(key)); + + handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + assert(immutable_options_); + Statistics* const statistics = immutable_options_->stats; + + RecordTick(statistics, NO_FILE_OPENS); + + std::unique_ptr reader; + + { + assert(file_options_); + const Status s = BlobFileReader::Create( + *immutable_options_, *file_options_, column_family_id_, + blob_file_read_hist_, blob_file_number, io_tracer_, &reader); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + { + constexpr size_t charge = 1; + + const Status s = cache_->Insert(key, reader.get(), charge, + &DeleteCacheEntry, &handle); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + reader.release(); + + *blob_file_reader = CacheHandleGuard(cache_, handle); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_cache.h b/db/blob/blob_file_cache.h new file mode 100644 index 00000000000..8eec05f184e --- /dev/null +++ b/db/blob/blob_file_cache.h @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include <cinttypes> + +#include "cache/cache_helpers.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +class Status; +class BlobFileReader; +class Slice; +class IOTracer; + +class BlobFileCache { + public: + BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options, + const FileOptions* file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr<IOTracer>& io_tracer); + + BlobFileCache(const BlobFileCache&) = delete; + BlobFileCache& operator=(const BlobFileCache&) = delete; + + Status GetBlobFileReader(uint64_t blob_file_number, + CacheHandleGuard<BlobFileReader>* blob_file_reader); + + private: + Cache* cache_; + // Note: mutex_ below is used to guard against multiple threads racing to open + // the same file. + Striped<port::Mutex, Slice> mutex_; + const ImmutableOptions* immutable_options_; + const FileOptions* file_options_; + uint32_t column_family_id_; + HistogramImpl* blob_file_read_hist_; + std::shared_ptr<IOTracer> io_tracer_; + + static constexpr size_t kNumberOfMutexStripes = 1 << 7; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_cache_test.cc b/db/blob/blob_file_cache_test.cc new file mode 100644 index 00000000000..bef2d6202e7 --- /dev/null +++ b/db/blob/blob_file_cache_test.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_cache.h" + +#include <cassert> +#include <string> + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with a single blob in it.
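+// The helper goes through the same BlobLogWriter used by the production write
+// path: it emits a blob log header, a single key/blob record and a footer, so
+// the resulting file can subsequently be opened by BlobFileReader via the
+// cache under test.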
+void WriteBlobFile(uint32_t column_family_id, + const ImmutableOptions& immutable_options, + uint64_t blob_file_number) { + assert(!immutable_options.cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + std::string compressed_blob; + + uint64_t key_offset = 0; + uint64_t blob_offset = 0; + + ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset)); + + BlobLogFooter footer; + footer.blob_count = 1; + footer.expiration_range = expiration_range; + + std::string checksum_method; + std::string checksum_value; + + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +} // anonymous namespace + +class BlobFileCacheTest : public testing::Test { + protected: + BlobFileCacheTest() : mock_env_(Env::Default()) {} + + MockEnv mock_env_; +}; + +TEST_F(BlobFileCacheTest, GetBlobFileReader) { + Options options; + options.env = &mock_env_; + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(&mock_env_, "BlobFileCacheTest_GetBlobFileReader"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // First try: reader should be opened and put in cache + CacheHandleGuard first; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + // Second try: reader should be served from cache + CacheHandleGuard second; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { + Options options; + options.env = &mock_env_; + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(&mock_env_, + "BlobFileCacheTest_GetBlobFileReader_Race"), + 0); + options.enable_blob_files = 
true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + CacheHandleGuard first; + CacheHandleGuard second; + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { + // Disabling sync points to prevent infinite recursion + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { + Options options; + options.env = &mock_env_; + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(&mock_env_, + "BlobFileCacheTest_GetBlobFileReader_IOError"), + 0); + options.enable_blob_files = true; + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + ImmutableOptions immutable_options(options); + FileOptions file_options; + constexpr uint32_t column_family_id = 1; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Note: there is no blob file with the below number + constexpr uint64_t blob_file_number = 123; + + CacheHandleGuard reader; + + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) { + Options options; + options.env = &mock_env_; + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(&mock_env_, + "BlobFileCacheTest_GetBlobFileReader_CacheFull"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 0; + constexpr int num_shard_bits = -1; // determined automatically + constexpr bool strict_capacity_limit = true; + std::shared_ptr backing_cache = + NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + 
BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Insert into cache should fail since it has zero capacity and + // strict_capacity_limit is set + CacheHandleGuard reader; + + ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) + .IsIncomplete()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/blob/blob_file_completion_callback.h b/db/blob/blob_file_completion_callback.h new file mode 100644 index 00000000000..42b6def893c --- /dev/null +++ b/db/blob/blob_file_completion_callback.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "db/error_handler.h" +#include "file/sst_file_manager_impl.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileCompletionCallback { + public: +#ifdef ROCKSDB_LITE + BlobFileCompletionCallback(SstFileManager* /*sst_file_manager*/, + InstrumentedMutex* /*mutex*/, + ErrorHandler* /*error_handler*/) {} + Status OnBlobFileCompleted(const std::string& /*file_name*/) { + return Status::OK(); + } +#else + BlobFileCompletionCallback(SstFileManager* sst_file_manager, + InstrumentedMutex* mutex, + ErrorHandler* error_handler) + : sst_file_manager_(sst_file_manager), + mutex_(mutex), + error_handler_(error_handler) {} + + Status OnBlobFileCompleted(const std::string& file_name) { + Status s; + auto sfm = static_cast(sst_file_manager_); + if (sfm) { + // Report new blob files to SstFileManagerImpl + s = sfm->OnAddFile(file_name); + if (sfm->IsMaxAllowedSpaceReached()) { + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached"); + InstrumentedMutexLock l(mutex_); + error_handler_->SetBGError(s, BackgroundErrorReason::kFlush); + } + } + return s; + } + + private: + SstFileManager* sst_file_manager_; + InstrumentedMutex* mutex_; + ErrorHandler* error_handler_; +#endif // ROCKSDB_LITE +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc index 0cae4eb5341..1b4b82150da 100644 --- a/db/blob/blob_file_reader.cc +++ b/db/blob/blob_file_reader.cc @@ -21,9 +21,9 @@ namespace ROCKSDB_NAMESPACE { Status BlobFileReader::Create( - const ImmutableCFOptions& immutable_cf_options, - const FileOptions& file_options, uint32_t column_family_id, - HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const ImmutableOptions& immutable_options, const FileOptions& file_options, + uint32_t column_family_id, HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, const std::shared_ptr& io_tracer, std::unique_ptr* blob_file_reader) { assert(blob_file_reader); assert(!*blob_file_reader); @@ -33,8 +33,8 @@ Status BlobFileReader::Create( 
{ const Status s = - OpenFile(immutable_cf_options, file_options, blob_file_read_hist, - blob_file_number, &file_size, &file_reader); + OpenFile(immutable_options, file_options, blob_file_read_hist, + blob_file_number, io_tracer, &file_size, &file_reader); if (!s.ok()) { return s; } @@ -66,20 +66,20 @@ Status BlobFileReader::Create( } Status BlobFileReader::OpenFile( - const ImmutableCFOptions& immutable_cf_options, - const FileOptions& file_opts, HistogramImpl* blob_file_read_hist, - uint64_t blob_file_number, uint64_t* file_size, + const ImmutableOptions& immutable_options, const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, uint64_t* file_size, std::unique_ptr* file_reader) { assert(file_size); assert(file_reader); - const auto& cf_paths = immutable_cf_options.cf_paths; + const auto& cf_paths = immutable_options.cf_paths; assert(!cf_paths.empty()); const std::string blob_file_path = BlobFileName(cf_paths.front().path, blob_file_number); - FileSystem* const fs = immutable_cf_options.fs; + FileSystem* const fs = immutable_options.fs.get(); assert(fs); constexpr IODebugContext* dbg = nullptr; @@ -112,15 +112,15 @@ Status BlobFileReader::OpenFile( assert(file); - if (immutable_cf_options.advise_random_on_open) { + if (immutable_options.advise_random_on_open) { file->Hint(FSRandomAccessFile::kRandom); } file_reader->reset(new RandomAccessFileReader( - std::move(file), blob_file_path, immutable_cf_options.env, - std::shared_ptr(), immutable_cf_options.statistics, - BLOB_DB_BLOB_FILE_READ_MICROS, blob_file_read_hist, - immutable_cf_options.rate_limiter, immutable_cf_options.listeners)); + std::move(file), blob_file_path, immutable_options.clock, io_tracer, + immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS, + blob_file_read_hist, immutable_options.rate_limiter.get(), + immutable_options.listeners)); return Status::OK(); } @@ -269,7 +269,8 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options, const Slice& user_key, uint64_t offset, uint64_t value_size, CompressionType compression_type, - PinnableSlice* value) const { + PinnableSlice* value, + uint64_t* bytes_read) const { assert(value); const uint64_t key_size = user_key.size(); @@ -292,6 +293,9 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options, : 0; assert(offset >= adjustment); + const uint64_t record_offset = offset - adjustment; + const uint64_t record_size = value_size + adjustment; + Slice record_slice; Buffer buf; AlignedBuf aligned_buf; @@ -299,9 +303,6 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options, { TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile"); - const uint64_t record_offset = offset - adjustment; - const uint64_t record_size = value_size + adjustment; - const Status s = ReadFromFile(file_reader_.get(), record_offset, static_cast(record_size), &record_slice, &buf, &aligned_buf); @@ -330,6 +331,10 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options, } } + if (bytes_read) { + *bytes_read = record_size; + } + return Status::OK(); } diff --git a/db/blob/blob_file_reader.h b/db/blob/blob_file_reader.h index 8c7df393df9..9b3f5ebd624 100644 --- a/db/blob/blob_file_reader.h +++ b/db/blob/blob_file_reader.h @@ -15,7 +15,7 @@ namespace ROCKSDB_NAMESPACE { class Status; -struct ImmutableCFOptions; +struct ImmutableOptions; struct FileOptions; class HistogramImpl; struct ReadOptions; @@ -24,11 +24,12 @@ class PinnableSlice; class BlobFileReader { public: - static Status Create(const 
ImmutableCFOptions& immutable_cf_options, + static Status Create(const ImmutableOptions& immutable_options, const FileOptions& file_options, uint32_t column_family_id, HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, std::unique_ptr* reader); BlobFileReader(const BlobFileReader&) = delete; @@ -38,16 +39,19 @@ class BlobFileReader { Status GetBlob(const ReadOptions& read_options, const Slice& user_key, uint64_t offset, uint64_t value_size, - CompressionType compression_type, PinnableSlice* value) const; + CompressionType compression_type, PinnableSlice* value, + uint64_t* bytes_read) const; private: BlobFileReader(std::unique_ptr&& file_reader, uint64_t file_size, CompressionType compression_type); - static Status OpenFile(const ImmutableCFOptions& immutable_cf_options, + static Status OpenFile(const ImmutableOptions& immutable_options, const FileOptions& file_opts, HistogramImpl* blob_file_read_hist, - uint64_t blob_file_number, uint64_t* file_size, + uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + uint64_t* file_size, std::unique_ptr* file_reader); static Status ReadHeader(const RandomAccessFileReader* file_reader, diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc index e8af662f1eb..e08a4bab836 100644 --- a/db/blob/blob_file_reader_test.cc +++ b/db/blob/blob_file_reader_test.cc @@ -18,6 +18,7 @@ #include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/options.h" +#include "test_util/sync_point.h" #include "test_util/testharness.h" #include "util/compression.h" #include "utilities/fault_injection_env.h" @@ -29,34 +30,34 @@ namespace { // Creates a test blob file with a single blob in it. Note: this method // makes it possible to test various corner cases by allowing the caller // to specify the contents of various blob file header/footer fields. 
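A recurring change in the reader tests below is the new bytes_read out-parameter of BlobFileReader::GetBlob. Without checksum verification only the value itself is read, so bytes_read equals the blob size; with read_options.verify_checksums set, the whole record (header, key and value) is read, and the tests expect the record-header adjustment to be included. Roughly, the expectation exercised below:

uint64_t bytes_read = 0;
ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
                          kNoCompression, &value, &bytes_read));
// verify_checksums == false: bytes_read == blob_size
// verify_checksums == true:
//   bytes_read == BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
//                 + blob_size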
-void WriteBlobFile(const ImmutableCFOptions& immutable_cf_options, +void WriteBlobFile(const ImmutableOptions& immutable_options, uint32_t column_family_id, bool has_ttl, const ExpirationRange& expiration_range_header, const ExpirationRange& expiration_range_footer, uint64_t blob_file_number, const Slice& key, const Slice& blob, CompressionType compression_type, uint64_t* blob_offset, uint64_t* blob_size) { - assert(!immutable_cf_options.cf_paths.empty()); + assert(!immutable_options.cf_paths.empty()); assert(blob_offset); assert(blob_size); - const std::string blob_file_path = BlobFileName( - immutable_cf_options.cf_paths.front().path, blob_file_number); + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); std::unique_ptr file; - ASSERT_OK(NewWritableFile(immutable_cf_options.fs, blob_file_path, &file, + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, FileOptions())); - std::unique_ptr file_writer( - new WritableFileWriter(std::move(file), blob_file_path, FileOptions(), - immutable_cf_options.env)); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); constexpr Statistics* statistics = nullptr; constexpr bool use_fsync = false; + constexpr bool do_flush = false; - BlobLogWriter blob_log_writer(std::move(file_writer), - immutable_cf_options.env, statistics, - blob_file_number, use_fsync); + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); BlobLogHeader header(column_family_id, compression_type, has_ttl, expiration_range_header); @@ -120,7 +121,7 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -132,17 +133,17 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_OK(BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader)); + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification ReadOptions read_options; @@ -150,83 +151,103 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, - kNoCompression, &value)); + kNoCompression, &value, &bytes_read)); ASSERT_EQ(value, blob); + ASSERT_EQ(bytes_read, blob_size); } read_options.verify_checksums = true; { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, - kNoCompression, &value)); + kNoCompression, &value, &bytes_read)); ASSERT_EQ(value, blob); + + constexpr 
uint64_t key_size = sizeof(key) - 1; + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size); } // Invalid offset (too close to start of file) { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, key, blob_offset - 1, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Invalid offset (too close to end of file) { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, key, blob_offset + 1, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Incorrect compression type { PinnableSlice value; + uint64_t bytes_read = 0; - ASSERT_TRUE( - reader - ->GetBlob(read_options, key, blob_offset, blob_size, kZSTD, &value) - .IsCorruption()); + ASSERT_TRUE(reader + ->GetBlob(read_options, key, blob_offset, blob_size, kZSTD, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Incorrect key size { constexpr char shorter_key[] = "k"; PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, shorter_key, blob_offset - (sizeof(key) - sizeof(shorter_key)), - blob_size, kNoCompression, &value) + blob_size, kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Incorrect key { constexpr char incorrect_key[] = "foo"; PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, incorrect_key, blob_offset, - blob_size, kNoCompression, &value) + blob_size, kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Incorrect value size { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, key, blob_offset, blob_size + 1, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } } @@ -240,7 +261,7 @@ TEST_F(BlobFileReaderTest, Malformed) { test::PerThreadDBPath(&mock_env_, "BlobFileReaderTest_Malformed"), 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr uint64_t blob_file_number = 1; @@ -249,23 +270,24 @@ TEST_F(BlobFileReaderTest, Malformed) { constexpr bool has_ttl = false; constexpr ExpirationRange expiration_range; - const std::string blob_file_path = BlobFileName( - immutable_cf_options.cf_paths.front().path, blob_file_number); + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); std::unique_ptr file; - ASSERT_OK(NewWritableFile(immutable_cf_options.fs, blob_file_path, &file, + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, FileOptions())); std::unique_ptr file_writer( new WritableFileWriter(std::move(file), blob_file_path, FileOptions(), - immutable_cf_options.env)); + immutable_options.clock)); constexpr Statistics* statistics = nullptr; constexpr bool use_fsync = false; + constexpr bool do_flush = false; BlobLogWriter blob_log_writer(std::move(file_writer), - immutable_cf_options.env, statistics, - blob_file_number, use_fsync); + immutable_options.clock, statistics, + blob_file_number, use_fsync, do_flush); BlobLogHeader header(column_family_id, kNoCompression, has_ttl, expiration_range); @@ -277,9 +299,10 @@ TEST_F(BlobFileReaderTest, Malformed) { std::unique_ptr reader; - 
ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, &reader) + blob_file_number, nullptr /*IOTracer*/, + &reader) .IsCorruption()); } @@ -290,7 +313,7 @@ TEST_F(BlobFileReaderTest, TTL) { test::PerThreadDBPath(&mock_env_, "BlobFileReaderTest_TTL"), 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = true; @@ -302,17 +325,18 @@ TEST_F(BlobFileReaderTest, TTL) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, &reader) + blob_file_number, nullptr /*IOTracer*/, + &reader) .IsCorruption()); } @@ -325,7 +349,7 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -339,7 +363,7 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range_header, expiration_range_footer, blob_file_number, key, blob, kNoCompression, &blob_offset, &blob_size); @@ -348,9 +372,10 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { std::unique_ptr reader; - ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, &reader) + blob_file_number, nullptr /*IOTracer*/, + &reader) .IsCorruption()); } @@ -363,7 +388,7 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -377,7 +402,7 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range_header, expiration_range_footer, blob_file_number, key, blob, kNoCompression, &blob_offset, &blob_size); @@ -386,9 +411,10 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { std::unique_ptr reader; - ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, &reader) + blob_file_number, nullptr /*IOTracer*/, + &reader) .IsCorruption()); } @@ -401,7 +427,7 @@ 
TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -413,9 +439,9 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; @@ -423,10 +449,10 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { constexpr uint32_t incorrect_column_family_id = 2; - ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), incorrect_column_family_id, blob_file_read_hist, blob_file_number, - &reader) + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -437,7 +463,7 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { test::PerThreadDBPath(&mock_env_, "BlobFileReaderTest_BlobCRCError"), 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -449,17 +475,17 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_OK(BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader)); + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) { @@ -472,11 +498,13 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { SyncPoint::GetInstance()->EnableProcessing(); PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -493,7 +521,7 @@ TEST_F(BlobFileReaderTest, Compression) { test::PerThreadDBPath(&mock_env_, "BlobFileReaderTest_Compression"), 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -505,17 +533,17 @@ TEST_F(BlobFileReaderTest, Compression) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, + WriteBlobFile(immutable_options, 
column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kSnappyCompression, &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_OK(BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader)); + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification ReadOptions read_options; @@ -523,20 +551,28 @@ TEST_F(BlobFileReaderTest, Compression) { { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, - kSnappyCompression, &value)); + kSnappyCompression, &value, &bytes_read)); ASSERT_EQ(value, blob); + ASSERT_EQ(bytes_read, blob_size); } read_options.verify_checksums = true; { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, - kSnappyCompression, &value)); + kSnappyCompression, &value, &bytes_read)); ASSERT_EQ(value, blob); + + constexpr uint64_t key_size = sizeof(key) - 1; + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size); } } @@ -553,7 +589,7 @@ TEST_F(BlobFileReaderTest, UncompressionError) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -565,17 +601,17 @@ TEST_F(BlobFileReaderTest, UncompressionError) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kSnappyCompression, &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_OK(BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader)); + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { @@ -589,11 +625,13 @@ TEST_F(BlobFileReaderTest, UncompressionError) { SyncPoint::GetInstance()->EnableProcessing(); PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kSnappyCompression, &value) + kSnappyCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -632,7 +670,7 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -644,9 +682,9 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - 
kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { fault_injection_env_.SetFilesystemActive(false, @@ -658,9 +696,9 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { std::unique_ptr reader; - const Status s = BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader); + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); @@ -671,11 +709,13 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { ASSERT_OK(s); PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsIOError()); + ASSERT_EQ(bytes_read, 0); } SyncPoint::GetInstance()->DisableProcessing(); @@ -708,7 +748,7 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -720,9 +760,9 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) { Slice* const slice = static_cast(arg); @@ -738,9 +778,9 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { std::unique_ptr reader; - const Status s = BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader); + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; @@ -751,11 +791,13 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { ASSERT_OK(s); PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } SyncPoint::GetInstance()->DisableProcessing(); diff --git a/db/blob/blob_log_format.cc b/db/blob/blob_log_format.cc index b5cd0bdcc77..482bd078e6b 100644 --- a/db/blob/blob_log_format.cc +++ b/db/blob/blob_log_format.cc @@ -95,10 +95,6 @@ Status BlobLogFooter::DecodeFrom(Slice src) { return Status::OK(); } -uint64_t BlobLogRecord::CalculateAdjustmentForRecordHeader(uint64_t key_size) { - return key_size + kHeaderSize; -} - void BlobLogRecord::EncodeHeaderTo(std::string* dst) { assert(dst != nullptr); dst->clear(); diff --git a/db/blob/blob_log_format.h b/db/blob/blob_log_format.h index afeb8d37090..539bbb52613 100644 --- 
a/db/blob/blob_log_format.h +++ b/db/blob/blob_log_format.h @@ -107,7 +107,9 @@ struct BlobLogRecord { // Note that the offset field of BlobIndex actually points to the blob value // as opposed to the start of the blob record. The following method can // be used to calculate the adjustment needed to read the blob record header. - static uint64_t CalculateAdjustmentForRecordHeader(uint64_t key_size); + static uint64_t CalculateAdjustmentForRecordHeader(uint64_t key_size) { + return key_size + kHeaderSize; + } uint64_t key_size = 0; uint64_t value_size = 0; diff --git a/db/blob/blob_log_sequential_reader.cc b/db/blob/blob_log_sequential_reader.cc index 58afd27a9c6..448b3b6f7d6 100644 --- a/db/blob/blob_log_sequential_reader.cc +++ b/db/blob/blob_log_sequential_reader.cc @@ -6,8 +6,6 @@ #include "db/blob/blob_log_sequential_reader.h" -#include - #include "file/random_access_file_reader.h" #include "monitoring/statistics.h" #include "util/stop_watch.h" @@ -15,10 +13,10 @@ namespace ROCKSDB_NAMESPACE { BlobLogSequentialReader::BlobLogSequentialReader( - std::unique_ptr&& file_reader, Env* env, + std::unique_ptr&& file_reader, SystemClock* clock, Statistics* statistics) : file_(std::move(file_reader)), - env_(env), + clock_(clock), statistics_(statistics), next_byte_(0) {} @@ -29,7 +27,7 @@ Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice, assert(slice); assert(file_); - StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); Status s = file_->Read(IOOptions(), next_byte_, static_cast(size), slice, buf, nullptr); next_byte_ += size; diff --git a/db/blob/blob_log_sequential_reader.h b/db/blob/blob_log_sequential_reader.h index 62c707b4dd8..f8e1c02bdbc 100644 --- a/db/blob/blob_log_sequential_reader.h +++ b/db/blob/blob_log_sequential_reader.h @@ -16,6 +16,7 @@ class RandomAccessFileReader; class Env; class Statistics; class Status; +class SystemClock; /** * BlobLogSequentialReader is a general purpose log stream reader @@ -35,7 +36,7 @@ class BlobLogSequentialReader { // Create a reader that will return log records from "*file_reader". 
BlobLogSequentialReader(std::unique_ptr&& file_reader, - Env* env, Statistics* statistics); + SystemClock* clock, Statistics* statistics); // No copying allowed BlobLogSequentialReader(const BlobLogSequentialReader&) = delete; @@ -63,7 +64,8 @@ class BlobLogSequentialReader { Status ReadSlice(uint64_t size, Slice* slice, char* buf); const std::unique_ptr file_; - Env* env_; + SystemClock* clock_; + Statistics* statistics_; Slice buffer_; diff --git a/db/blob/blob_log_writer.cc b/db/blob/blob_log_writer.cc index 8b3d0e2c736..2dabc98e802 100644 --- a/db/blob/blob_log_writer.cc +++ b/db/blob/blob_log_writer.cc @@ -11,7 +11,7 @@ #include "db/blob/blob_log_format.h" #include "file/writable_file_writer.h" #include "monitoring/statistics.h" -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -19,14 +19,16 @@ namespace ROCKSDB_NAMESPACE { BlobLogWriter::BlobLogWriter(std::unique_ptr&& dest, - Env* env, Statistics* statistics, - uint64_t log_number, bool use_fs, uint64_t boffset) + SystemClock* clock, Statistics* statistics, + uint64_t log_number, bool use_fs, bool do_flush, + uint64_t boffset) : dest_(std::move(dest)), - env_(env), + clock_(clock), statistics_(statistics), log_number_(log_number), block_offset_(boffset), use_fsync_(use_fs), + do_flush_(do_flush), last_elem_type_(kEtNone) {} BlobLogWriter::~BlobLogWriter() = default; @@ -34,7 +36,7 @@ BlobLogWriter::~BlobLogWriter() = default; Status BlobLogWriter::Sync() { TEST_SYNC_POINT("BlobLogWriter::Sync"); - StopWatch sync_sw(env_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); + StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); Status s = dest_->Sync(use_fsync_); RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); return s; @@ -49,7 +51,9 @@ Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { Status s = dest_->Append(Slice(str)); if (s.ok()) { block_offset_ += str.size(); - s = dest_->Flush(); + if (do_flush_) { + s = dest_->Flush(); + } } last_elem_type_ = kEtFileHdr; RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, @@ -144,7 +148,7 @@ Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset) { - StopWatch write_sw(env_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); + StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); Status s = dest_->Append(Slice(headerbuf)); if (s.ok()) { s = dest_->Append(key); @@ -152,7 +156,7 @@ Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, if (s.ok()) { s = dest_->Append(val); } - if (s.ok()) { + if (do_flush_ && s.ok()) { s = dest_->Flush(); } diff --git a/db/blob/blob_log_writer.h b/db/blob/blob_log_writer.h index 0f9ea251642..c1f9f31ad00 100644 --- a/db/blob/blob_log_writer.h +++ b/db/blob/blob_log_writer.h @@ -9,7 +9,6 @@ #include #include "db/blob/blob_log_format.h" -#include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" @@ -18,7 +17,7 @@ namespace ROCKSDB_NAMESPACE { class WritableFileWriter; - +class SystemClock; /** * BlobLogWriter is the blob log stream writer. It provides an append-only * abstraction for writing blob data. @@ -32,9 +31,9 @@ class BlobLogWriter { // Create a writer that will append data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this BlobLogWriter is in use. 
- BlobLogWriter(std::unique_ptr&& dest, Env* env, + BlobLogWriter(std::unique_ptr&& dest, SystemClock* clock, Statistics* statistics, uint64_t log_number, bool use_fsync, - uint64_t boffset = 0); + bool do_flush, uint64_t boffset = 0); // No copying allowed BlobLogWriter(const BlobLogWriter&) = delete; BlobLogWriter& operator=(const BlobLogWriter&) = delete; @@ -69,11 +68,12 @@ class BlobLogWriter { private: std::unique_ptr dest_; - Env* env_; + SystemClock* clock_; Statistics* statistics_; uint64_t log_number_; uint64_t block_offset_; // Current offset in block bool use_fsync_; + bool do_flush_; public: enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter }; diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc new file mode 100644 index 00000000000..feee834c51f --- /dev/null +++ b/db/blob/db_blob_basic_test.cc @@ -0,0 +1,517 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "db/blob/blob_index.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobBasicTest : public DBTestBase { + protected: + DBBlobBasicTest() + : DBTestBase("/db_blob_basic_test", /* env_do_fsync */ false) {} +}; + +TEST_F(DBBlobBasicTest, GetBlob) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get(key), blob_value); + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches; however, the blob itself can only be + // read from the blob file, so the read should return Incomplete. + ReadOptions read_options; + read_options.read_tier = kBlockCacheTier; + + PinnableSlice result; + ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result) + .IsIncomplete()); +} + +TEST_F(DBBlobBasicTest, MultiGetBlobs) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. 
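+// (With min_blob_size = 6, "short" is 5 bytes and stays inline, while
+// "long_value" and "other_long_value" are 10 and 16 bytes respectively and
+// end up in the blob file; the static_asserts below pin down these size
+// assumptions.)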
+ constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + + std::array keys{{first_key, second_key, third_key}}; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches. The first (inlined) value should be + // successfully read; however, the two blob values could only be read from the + // blob file, so for those the read should return Incomplete. + read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_TRUE(statuses[1].IsIncomplete()); + + ASSERT_TRUE(statuses[2].IsIncomplete()); + } +} + +TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a corrupt blob index. + const std::string blob_index("foobar"); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. 
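+// Inlined TTL blob indexes are only written by the legacy BlobDB; the
+// integrated blob file path never produces them, so the Get below is expected
+// to surface such an index as corruption rather than return the inlined value.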
+ std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a blob index referencing a non-existent blob file. + std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, GenerateIOTracing) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + std::string trace_file = dbname_ + "/io_trace_file"; + + Reopen(options); + { + // Create IO trace file + std::unique_ptr trace_writer; + ASSERT_OK( + NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer)); + ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer))); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + ASSERT_EQ(Get(key), blob_value); + + ASSERT_OK(db_->EndIOTrace()); + ASSERT_OK(env_->FileExists(trace_file)); + } + { + // Parse trace file to check file operations related to blob files are + // recorded. + std::unique_ptr trace_reader; + ASSERT_OK( + NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader)); + IOTraceReader reader(std::move(trace_reader)); + + IOTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, static_cast(header.rocksdb_major_version)); + ASSERT_EQ(kMinorVersion, static_cast(header.rocksdb_minor_version)); + + // Read records. + int blob_files_op_count = 0; + Status status; + while (true) { + IOTraceRecord record; + status = reader.ReadIOOp(&record); + if (!status.ok()) { + break; + } + if (record.file_name.find("blob") != std::string::npos) { + blob_files_op_count++; + } + } + // Assuming blob files will have Append, Close and then Read operations. 
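+// (That is, at least three I/O records should mention the blob file, which is
+// what the strict lower bound of 2 below checks.)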
+ ASSERT_GT(blob_files_op_count, 2); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + Reopen(options); + + ASSERT_OK(dbfull()->DisableFileDeletions()); + constexpr int kNumTableFiles = 2; + for (int i = 0; i < kNumTableFiles; ++i) { + for (char ch = 'a'; ch != 'c'; ++ch) { + std::string key(1, ch); + ASSERT_OK(Put(key, "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + Close(); + + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + std::string blob_file_path; + uint64_t max_blob_file_num = kInvalidBlobFileNumber; + for (const auto& fname : files) { + uint64_t file_num = 0; + FileType type; + if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) && + type == kBlobFile) { + if (file_num > max_blob_file_num) { + max_blob_file_num = file_num; + blob_file_path = dbname_ + "/" + fname; + } + } + } + ASSERT_OK(env_->DeleteFile(blob_file_path)); + + options.best_efforts_recovery = true; + Reopen(options); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "a", &value)); + ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value); +} + +TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) { + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v3")); + ASSERT_OK(Flush()); + + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value)); + ASSERT_EQ(Get("Key1"), "v1,v2,v3"); +} + +TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { + constexpr size_t num_keys = 3; + + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key0", "v0_0")); + ASSERT_OK(Put("Key1", "v1_0")); + ASSERT_OK(Put("Key2", "v2_0")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_1")); + ASSERT_OK(Merge("Key1", "v1_1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_2")); + ASSERT_OK(Flush()); + + std::array keys{{"Key0", "Key1", "Key2"}}; + std::array values; + std::array statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], "v1_0,v1_1"); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], "v2_0"); +} + +class DBBlobBasicIOErrorTest : public DBBlobBasicTest, + public testing::WithParamInterface { + protected: + DBBlobBasicIOErrorTest() : sync_point_(GetParam()) { + fault_injection_env_.reset(new FaultInjectionTestEnv(env_)); + } + ~DBBlobBasicIOErrorTest() { Close(); } + + std::unique_ptr fault_injection_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) { + Options options; + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + 
constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBBlobBasicIOErrorTest, MultiGetBlobs_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + std::array<Slice, num_keys> keys{{first_key, second_key}}; + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(statuses[0].IsIOError()); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +namespace { + +class ReadBlobCompactionFilter : public CompactionFilter { + public: + ReadBlobCompactionFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.read.blob"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const override { + if (value_type != CompactionFilter::ValueType::kValue) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + return CompactionFilter::Decision::kChangeValue; + } +}; + +} // anonymous namespace + +TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ReadBlobCompactionFilter); + options.compaction_filter = compaction_filter_guard.get(); + + DestroyAndReopen(options); + constexpr char key[] = "foo"; + constexpr char blob_value[] = "foo_blob_value"; + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { +
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/blob/db_blob_compaction_test.cc b/db/blob/db_blob_compaction_test.cc new file mode 100644 index 00000000000..29f10f2e228 --- /dev/null +++ b/db/blob/db_blob_compaction_test.cc @@ -0,0 +1,399 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_index.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCompactionTest : public DBTestBase { + public: + explicit DBBlobCompactionTest() + : DBTestBase("/db_blob_compaction_test", /*env_do_fsync=*/false) {} + +#ifndef ROCKSDB_LITE + const std::vector& GetCompactionStats() { + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetCompactionStats(); + } +#endif // ROCKSDB_LITE +}; + +namespace { + +class FilterByKeyLength : public CompactionFilter { + public: + explicit FilterByKeyLength(size_t len) : length_threshold_(len) {} + const char* Name() const override { + return "rocksdb.compaction.filter.by.key.length"; + } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() < length_threshold_) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + private: + size_t length_threshold_; +}; + +class BadBlobCompactionFilter : public CompactionFilter { + public: + explicit BadBlobCompactionFilter(std::string prefix, + CompactionFilter::Decision filter_by_key, + CompactionFilter::Decision filter_v2) + : prefix_(std::move(prefix)), + filter_blob_by_key_(filter_by_key), + filter_v2_(filter_v2) {} + const char* Name() const override { return "rocksdb.compaction.filter.bad"; } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() >= prefix_.size() && + 0 == strncmp(prefix_.data(), key.data(), prefix_.size())) { + return CompactionFilter::Decision::kUndetermined; + } + return filter_blob_by_key_; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return filter_v2_; + } + + private: + const std::string prefix_; + const CompactionFilter::Decision filter_blob_by_key_; + const CompactionFilter::Decision filter_v2_; +}; + +class ValueBlindWriteFilter : public CompactionFilter { + public: + explicit ValueBlindWriteFilter(std::string new_val) + : new_value_(std::move(new_val)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.blind.write"; + } + CompactionFilter::Decision FilterBlobByKey( + int level, const Slice& key, std::string* new_value, + std::string* skip_until) const override; + + 
private: + const std::string new_value_; +}; + +CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey( + int /*level*/, const Slice& /*key*/, std::string* new_value, + std::string* /*skip_until*/) const { + assert(new_value); + new_value->assign(new_value_); + return CompactionFilter::Decision::kChangeValue; +} + +class ValueMutationFilter : public CompactionFilter { + public: + explicit ValueMutationFilter(std::string padding) + : padding_(std::move(padding)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.value.mutation"; + } + CompactionFilter::Decision FilterV2(int level, const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string padding_; +}; + +CompactionFilter::Decision ValueMutationFilter::FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + assert(CompactionFilter::ValueType::kBlobIndex != value_type); + if (CompactionFilter::ValueType::kValue != value_type) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + new_value->append(padding_); + return CompactionFilter::Decision::kChangeValue; +} + +class AlwaysKeepFilter : public CompactionFilter { + public: + explicit AlwaysKeepFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.always.keep"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return CompactionFilter::Decision::kKeep; + } +}; +} // anonymous namespace + +class DBBlobBadCompactionFilterTest + : public DBBlobCompactionTest, + public testing::WithParamInterface< + std::tuple<std::string, CompactionFilter::Decision, + CompactionFilter::Decision>> { + public: + explicit DBBlobBadCompactionFilterTest() + : compaction_filter_guard_(new BadBlobCompactionFilter( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()))) {} + + protected: + std::unique_ptr<CompactionFilter> compaction_filter_guard_; +}; + +INSTANTIATE_TEST_CASE_P( + BadCompactionFilter, DBBlobBadCompactionFilterTest, + testing::Combine( + testing::Values("a"), + testing::Values(CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError), + testing::Values(CompactionFilter::Decision::kUndetermined, + CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError))); + +TEST_F(DBBlobCompactionTest, FilterByKeyLength) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr size_t kKeyLength = 2; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new FilterByKeyLength(kKeyLength)); + options.compaction_filter = compaction_filter_guard.get(); + + constexpr char short_key[] = "a"; + constexpr char long_key[] = "abc"; + constexpr char blob_value[] = "value"; + + DestroyAndReopen(options); + ASSERT_OK(Put(short_key, blob_value)); + ASSERT_OK(Put(long_key, blob_value)); + ASSERT_OK(Flush()); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound()); + value.clear(); + ASSERT_OK(db_->Get(ReadOptions(), long_key, &value)); + ASSERT_EQ("value", value); +
+#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides between kKeep and kRemove solely based on key; + // this involves neither reading nor writing blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, BlindWriteFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr char new_blob_value[] = "new_blob_value"; + std::unique_ptr compaction_filter_guard( + new ValueBlindWriteFilter(new_blob_value)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector keys = {"a", "b", "c"}; + const std::vector values = {"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& key : keys) { + ASSERT_EQ(new_blob_value, Get(key)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter unconditionally changes value in FilterBlobByKey; + // this involves writing but not reading blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.compaction_filter = compaction_filter_guard_.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); + + DestroyAndReopen(options); + std::string key(std::get<0>(GetParam())); + ASSERT_OK(Put(key, "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + // Fake an inlined TTL blob index. 
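// Illustrative note, not part of the patch: an inlined TTL blob index, as
// produced by BlobIndex::EncodeInlinedTTL below, carries an expiration time
// plus the value stored inline rather than a reference to a blob file. Such
// indexes come from the legacy StackableDB-based BlobDB, so when the
// integrated BlobDB's compaction encounters one it is reported as
// corruption, which is what this test asserts via IsCorruption().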
+ std::string blob_index; + constexpr uint64_t expiration = 1234567890; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + constexpr char padding[] = "_delta"; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter(padding)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector<std::pair<std::string, std::string>> kvs = { + {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}}; + for (const auto& kv : kvs) { + ASSERT_OK(Put(kv.first, kv.second)); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& kv : kvs) { + ASSERT_EQ(kv.second + std::string(padding), Get(kv.first)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter changes the value using the previous value in FilterV2; + // this involves reading and writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + // Mock a corrupted blob index + constexpr char key[] = "key"; + std::string blob_idx("blob_idx"); + WriteBatch write_batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&write_batch, 0, key, blob_idx)); + ASSERT_OK(db_->Write(WriteOptions(), &write_batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new AlwaysKeepFilter()); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Flush()); + std::vector<uint64_t> blob_files = GetBlobFileNumbers(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(blob_files, GetBlobFileNumbers()); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides to keep the existing value in FilterV2; + // this involves reading but not writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc,
argv); + return RUN_ALL_TESTS(); +} diff --git a/db/blob/db_blob_corruption_test.cc b/db/blob/db_blob_corruption_test.cc new file mode 100644 index 00000000000..77f11b75ab3 --- /dev/null +++ b/db/blob/db_blob_corruption_test.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCorruptionTest : public DBTestBase { + protected: + DBBlobCorruptionTest() + : DBTestBase("/db_blob_corruption_test", /* env_do_fsync */ false) {} + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + uint64_t picked_number = kInvalidBlobFileNumber; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == filetype && + number > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); + } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(options); + + ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1"))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2"))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + Close(); + + Corrupt(kBlobFile, 0, 2); + + ASSERT_OK(TryReopen(options)); + + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + const Status* s = static_cast(arg); + ASSERT_NE(s, nullptr); + ++count; + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/blob/db_blob_index_test.cc b/db/blob/db_blob_index_test.cc index e7ceabd3e56..34bcd9fb584 100644 --- a/db/blob/db_blob_index_test.cc +++ b/db/blob/db_blob_index_test.cc @@ -73,6 +73,9 @@ class DBBlobIndexTest : public DBTestBase { if (s.IsNotFound()) { return "NOT_FOUND"; } + if (s.IsCorruption()) { + return "CORRUPTION"; + } if (s.IsNotSupported()) { return "NOT_SUPPORTED"; } @@ -95,11 +98,12 @@ class DBBlobIndexTest : public DBTestBase { ArenaWrappedDBIter* GetBlobIterator() { return dbfull()->NewIteratorImpl( ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), - nullptr /*read_callback*/, true /*allow_blob*/); + nullptr /*read_callback*/, true /*expose_blob_index*/); } Options GetTestOptions() { Options options; 
+ options.env = CurrentOptions().env; options.create_if_missing = true; options.num_levels = 2; options.disable_auto_compactions = true; @@ -153,8 +157,13 @@ TEST_F(DBBlobIndexTest, Write) { } } -// Get should be able to return blob index if is_blob_index is provided, -// otherwise return Status::NotSupported status. +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should be able to return blob index if is_blob_index is +// provided, otherwise it should return Status::NotSupported (when reading from +// memtable) or Status::Corruption (when reading from SST). Reading from SST +// returns Corruption because we can't differentiate between the application +// accidentally opening the base DB of a stacked BlobDB and actual corruption +// when using the integrated BlobDB. TEST_F(DBBlobIndexTest, Get) { for (auto tier : kAllTiers) { DestroyAndReopen(GetTestOptions()); @@ -171,15 +180,22 @@ TEST_F(DBBlobIndexTest, Get) { ASSERT_EQ("value", GetImpl("key", &is_blob_index)); ASSERT_FALSE(is_blob_index); // Verify blob index - ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); - ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + if (tier <= kImmutableMemtables) { + ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + } else { + ASSERT_TRUE(Get("blob_key", &value).IsCorruption()); + ASSERT_EQ("CORRUPTION", GetImpl("blob_key")); + } ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index)); ASSERT_TRUE(is_blob_index); } } -// Get should NOT return Status::NotSupported if blob index is updated with -// a normal value. +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should NOT return Status::NotSupported/Status::Corruption +// if blob index is updated with a normal value. See the test case above for +// more details. TEST_F(DBBlobIndexTest, Updated) { for (auto tier : kAllTiers) { DestroyAndReopen(GetTestOptions()); @@ -206,7 +222,11 @@ TEST_F(DBBlobIndexTest, Updated) { ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot)); } ASSERT_EQ("new_value", Get("key1")); - ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + if (tier <= kImmutableMemtables) { + ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + } else { + ASSERT_EQ("CORRUPTION", GetImpl("key2")); + } ASSERT_EQ("NOT_FOUND", Get("key3")); ASSERT_EQ("NOT_FOUND", Get("key4")); ASSERT_EQ("a,b,c", GetImpl("key5")); @@ -218,8 +238,11 @@ TEST_F(DBBlobIndexTest, Updated) { } } -// Iterator should get blob value if allow_blob flag is set, -// otherwise return Status::NotSupported status. +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. When a blob iterator is used, it should set the +// expose_blob_index flag for the underlying DBIter, and retrieve/return the +// corresponding blob value. If a regular DBIter is created (i.e. +// expose_blob_index is not set), it should return Status::Corruption. 
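To make the behavior described above concrete, the following is a rough sketch (not part of the patch) of what an application sees if it opens the base DB underneath a stacked BlobDB and reads a key whose stored value is a blob index; the DB handle and key name are hypothetical:

    std::string value;
    rocksdb::Status s =
        base_db->Get(rocksdb::ReadOptions(), "blob_key", &value);
    // Blob index still in a (mutable or immutable) memtable:
    //   s.IsNotSupported() == true
    // Blob index already flushed to an SST file:
    //   s.IsCorruption() == true
    // With the integrated BlobDB (enable_blob_files), Get() instead resolves
    // the blob and returns the value with an OK status.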
TEST_F(DBBlobIndexTest, Iterate) { const std::vector> data = { /*00*/ {kTypeValue}, @@ -282,6 +305,7 @@ TEST_F(DBBlobIndexTest, Iterate) { std::function extra_check = nullptr) { // Seek auto* iterator = create_iterator(); + ASSERT_OK(iterator->status()); ASSERT_OK(iterator->Refresh()); iterator->Seek(get_key(index)); check_iterator(iterator, expected_status, forward_value); @@ -295,6 +319,7 @@ TEST_F(DBBlobIndexTest, Iterate) { ASSERT_OK(iterator->Refresh()); iterator->Seek(get_key(index - 1)); ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); iterator->Next(); check_iterator(iterator, expected_status, forward_value); if (extra_check) { @@ -304,6 +329,7 @@ TEST_F(DBBlobIndexTest, Iterate) { // SeekForPrev iterator = create_iterator(); + ASSERT_OK(iterator->status()); ASSERT_OK(iterator->Refresh()); iterator->SeekForPrev(get_key(index)); check_iterator(iterator, expected_status, backward_value); @@ -316,6 +342,7 @@ TEST_F(DBBlobIndexTest, Iterate) { iterator = create_iterator(); iterator->Seek(get_key(index + 1)); ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); iterator->Prev(); check_iterator(iterator, expected_status, backward_value); if (extra_check) { @@ -353,7 +380,7 @@ TEST_F(DBBlobIndexTest, Iterate) { ASSERT_OK(Write(&batch)); break; default: - assert(false); + FAIL(); }; } snapshots.push_back(dbfull()->GetSnapshot()); @@ -364,15 +391,15 @@ TEST_F(DBBlobIndexTest, Iterate) { MoveDataTo(tier); // Normal iterator - verify(1, Status::kNotSupported, "", "", create_normal_iterator); - verify(3, Status::kNotSupported, "", "", create_normal_iterator); + verify(1, Status::kCorruption, "", "", create_normal_iterator); + verify(3, Status::kCorruption, "", "", create_normal_iterator); verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), create_normal_iterator); verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), create_normal_iterator); verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), create_normal_iterator); - verify(11, Status::kNotSupported, "", "", create_normal_iterator); + verify(11, Status::kCorruption, "", "", create_normal_iterator); verify(13, Status::kOk, get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), @@ -391,7 +418,11 @@ TEST_F(DBBlobIndexTest, Iterate) { create_blob_iterator, check_is_blob(false)); verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), create_blob_iterator, check_is_blob(false)); - verify(11, Status::kNotSupported, "", "", create_blob_iterator); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } verify(13, Status::kOk, get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), @@ -413,7 +444,11 @@ TEST_F(DBBlobIndexTest, Iterate) { create_blob_iterator, check_is_blob(false)); verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), create_blob_iterator, check_is_blob(false)); - verify(11, Status::kNotSupported, "", "", create_blob_iterator); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } verify(13, Status::kOk, get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), @@ -428,6 +463,106 @@ 
TEST_F(DBBlobIndexTest, Iterate) { } } +TEST_F(DBBlobIndexTest, IntegratedBlobIterate) { + const std::vector> data = { + /*00*/ {"Put"}, + /*01*/ {"Put", "Merge", "Merge", "Merge"}, + /*02*/ {"Put"}}; + + auto get_key = [](size_t index) { return ("key" + std::to_string(index)); }; + + auto get_value = [&](size_t index, size_t version) { + return get_key(index) + "_value" + ToString(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status()); + if (expected_status.ok()) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto verify = [&](size_t index, Status expected_status, + const Slice& expected_value) { + // Seek + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Next + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Next(); + check_iterator(iterator, expected_status, expected_value); + } + // SeekForPrev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Prev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Prev(); + check_iterator(iterator, expected_status, expected_value); + } + }; + + Options options = GetTestOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + DestroyAndReopen(options); + + // fill data + for (size_t i = 0; i < data.size(); i++) { + for (size_t j = 0; j < data[i].size(); j++) { + std::string key = get_key(i); + std::string value = get_value(i, j); + if (data[i][j] == "Put") { + ASSERT_OK(Put(key, value)); + ASSERT_OK(Flush()); + } else if (data[i][j] == "Merge") { + ASSERT_OK(Merge(key, value)); + ASSERT_OK(Flush()); + } + } + } + + std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," + + get_value(1, 2) + "," + get_value(1, 3); + Status expected_status; + verify(1, expected_status, expected_value); + +#ifndef ROCKSDB_LITE + // Test DBIter::FindValueForCurrentKeyUsingSeek flow. 
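  // Roughly: DBIter normally reconstructs the value for the current key by
  // stepping over its internal entries; once it would have to skip more than
  // max_sequential_skip_in_iterations entries, it falls back to a targeted
  // re-seek (FindValueForCurrentKeyUsingSeek). Setting the limit to 0 below
  // forces that fallback immediately, so the same checks also cover the
  // seek-based path.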
+ ASSERT_OK(dbfull()->SetOptions(cfh(), + {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, expected_status, expected_value); +#endif // !ROCKSDB_LITE +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/builder.cc b/db/builder.cc index 90cfbbffbb7..88c99a08415 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -23,11 +23,13 @@ #include "db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "file/file_util.h" #include "file/filename.h" #include "file/read_write_util.h" #include "file/writable_file_writer.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_util.h" +#include "options/options_helper.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -43,62 +45,41 @@ namespace ROCKSDB_NAMESPACE { class TableFactory; -TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, - WritableFileWriter* file, const CompressionType compression_type, - uint64_t sample_for_compression, const CompressionOptions& compression_opts, - int level, const bool skip_filters, const uint64_t creation_time, - const uint64_t oldest_key_time, const uint64_t target_file_size, - const uint64_t file_creation_time, const std::string& db_id, - const std::string& db_session_id) { - assert((column_family_id == +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file) { + assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == - column_family_name.empty()); - return ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - int_tbl_prop_collector_factories, compression_type, - sample_for_compression, compression_opts, - skip_filters, column_family_name, level, - creation_time, oldest_key_time, target_file_size, - file_creation_time, db_id, db_session_id), - column_family_id, file); + tboptions.column_family_name.empty()); + return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file); } Status BuildTable( - const std::string& dbname, VersionSet* versions, Env* env, FileSystem* fs, - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, const FileOptions& file_options, - TableCache* table_cache, InternalIterator* iter, + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, const CompressionType compression, - uint64_t sample_for_compression, const CompressionOptions& compression_opts, - bool paranoid_file_checks, InternalStats* internal_stats, - TableFileCreationReason reason, IOStatus* io_status, + SnapshotChecker* snapshot_checker, bool paranoid_file_checks, + InternalStats* internal_stats, IOStatus* io_status, const std::shared_ptr& io_tracer, EventLogger* 
event_logger, int job_id, const Env::IOPriority io_priority, - TableProperties* table_properties, int level, const uint64_t creation_time, - const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint, - const uint64_t file_creation_time, const std::string& db_id, - const std::string& db_session_id) { - assert((column_family_id == + TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, + const std::string* full_history_ts_low, + BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries, + uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) { + assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == - column_family_name.empty()); + tboptions.column_family_name.empty()); + auto& mutable_cf_options = tboptions.moptions; + auto& ioptions = tboptions.ioptions; // Reports the IOStats for flush for every following bytes. const size_t kReportFlushIOStatsEvery = 1048576; OutputValidator output_validator( - internal_comparator, + tboptions.internal_comparator, /*enable_order_check=*/ mutable_cf_options.check_flush_compaction_key_order, /*enable_hash=*/paranoid_file_checks); @@ -106,8 +87,15 @@ Status BuildTable( meta->fd.file_size = 0; iter->SeekToFirst(); std::unique_ptr range_del_agg( - new CompactionRangeDelAggregator(&internal_comparator, snapshots)); + new CompactionRangeDelAggregator(&tboptions.internal_comparator, + snapshots)); + uint64_t num_unfragmented_tombstones = 0; + uint64_t total_tombstone_payload_bytes = 0; for (auto& range_del_iter : range_del_iters) { + num_unfragmented_tombstones += + range_del_iter->num_unfragmented_tombstones(); + total_tombstone_payload_bytes += + range_del_iter->total_tombstone_payload_bytes(); range_del_agg->AddTombstones(std::move(range_del_iter)); } @@ -117,18 +105,39 @@ Status BuildTable( std::string file_checksum = kUnknownFileChecksum; std::string file_checksum_func_name = kUnknownFileChecksumFuncName; #ifndef ROCKSDB_LITE - EventHelpers::NotifyTableFileCreationStarted( - ioptions.listeners, dbname, column_family_name, fname, job_id, reason); + EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname, + tboptions.column_family_name, + fname, job_id, tboptions.reason); #endif // !ROCKSDB_LITE + Env* env = db_options.env; + assert(env); + FileSystem* fs = db_options.fs.get(); + assert(fs); + TableProperties tp; if (iter->Valid() || !range_del_agg->IsEmpty()) { + std::unique_ptr compaction_filter; + if (ioptions.compaction_filter_factory != nullptr && + ioptions.compaction_filter_factory->ShouldFilterTableFileCreation( + tboptions.reason)) { + CompactionFilter::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = false; + context.column_family_id = tboptions.column_family_id; + context.reason = tboptions.reason; + compaction_filter = + ioptions.compaction_filter_factory->CreateCompactionFilter(context); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s.PermitUncheckedError(); + return Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + } + } + TableBuilder* builder; std::unique_ptr file_writer; - // Currently we only enable dictionary compression during compaction to the - // bottommost level. 
- CompressionOptions compression_opts_for_flush(compression_opts); - compression_opts_for_flush.max_dict_bytes = 0; - compression_opts_for_flush.zstd_max_train_bytes = 0; { std::unique_ptr file; #ifndef NDEBUG @@ -143,49 +152,51 @@ Status BuildTable( } if (!s.ok()) { EventHelpers::LogAndNotifyTableFileCreationFinished( - event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s, - file_checksum, file_checksum_func_name); + event_logger, ioptions.listeners, dbname, + tboptions.column_family_name, fname, job_id, meta->fd, + kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum, + file_checksum_func_name); return s; } + FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; file->SetIOPriority(io_priority); file->SetWriteLifeTimeHint(write_hint); - file_writer.reset(new WritableFileWriter( - std::move(file), fname, file_options, env, io_tracer, - ioptions.statistics, ioptions.listeners, - ioptions.file_checksum_gen_factory)); - - builder = NewTableBuilder( - ioptions, mutable_cf_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, - column_family_name, file_writer.get(), compression, - sample_for_compression, compression_opts_for_flush, level, - false /* skip_filters */, creation_time, oldest_key_time, - 0 /*target_file_size*/, file_creation_time, db_id, db_session_id); + std::move(file), fname, file_options, ioptions.clock, io_tracer, + ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile))); + + builder = NewTableBuilder(tboptions, file_writer.get()); } - MergeHelper merge(env, internal_comparator.user_comparator(), - ioptions.merge_operator, nullptr, ioptions.info_log, - true /* internal key corruption is not ok */, - snapshots.empty() ? 0 : snapshots.back(), - snapshot_checker); + MergeHelper merge( + env, tboptions.internal_comparator.user_comparator(), + ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger, + true /* internal key corruption is not ok */, + snapshots.empty() ? 0 : snapshots.back(), snapshot_checker); std::unique_ptr blob_file_builder( (mutable_cf_options.enable_blob_files && blob_file_additions) - ? new BlobFileBuilder(versions, env, fs, &ioptions, - &mutable_cf_options, &file_options, job_id, - column_family_id, column_family_name, - io_priority, write_hint, &blob_file_paths, - blob_file_additions) + ? 
new BlobFileBuilder(versions, fs, &ioptions, &mutable_cf_options, + &file_options, job_id, + tboptions.column_family_id, + tboptions.column_family_name, io_priority, + write_hint, io_tracer, blob_callback, + &blob_file_paths, blob_file_additions) : nullptr); CompactionIterator c_iter( - iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber, - &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, - ShouldReportDetailedTime(env, ioptions.statistics), + iter, tboptions.internal_comparator.user_comparator(), &merge, + kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot, + snapshot_checker, env, ShouldReportDetailedTime(env, ioptions.stats), true /* internal key corruption is not ok */, range_del_agg.get(), - blob_file_builder.get(), ioptions.allow_data_in_errors); + blob_file_builder.get(), ioptions.allow_data_in_errors, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, + /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, db_options.info_log, + full_history_ts_low); c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { @@ -212,6 +223,7 @@ Status BuildTable( } else if (!c_iter.status().ok()) { s = c_iter.status(); } + if (s.ok()) { auto range_del_it = range_del_agg->NewIterator(); for (range_del_it->SeekToFirst(); range_del_it->Valid(); @@ -220,16 +232,17 @@ Status BuildTable( auto kv = tombstone.Serialize(); builder->Add(kv.first.Encode(), kv.second); meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), - tombstone.seq_, internal_comparator); - } - - if (blob_file_builder) { - s = blob_file_builder->Finish(); + tombstone.seq_, + tboptions.internal_comparator); } } TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); const bool empty = builder->IsEmpty(); + if (num_input_entries != nullptr) { + *num_input_entries = + c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; + } if (!s.ok() || empty) { builder->Abandon(); } else { @@ -245,6 +258,25 @@ Status BuildTable( meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); tp = builder->GetTableProperties(); // refresh now that builder is finished + if (memtable_payload_bytes != nullptr && + memtable_garbage_bytes != nullptr) { + const CompactionIterationStats& ci_stats = c_iter.iter_stats(); + uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes + + ci_stats.total_input_raw_value_bytes + + total_tombstone_payload_bytes; + uint64_t total_payload_bytes_written = + (tp.raw_key_size + tp.raw_value_size); + // Prevent underflow, which may still happen at this point + // since we only support inserts, deletes, and deleteRanges. 
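  // A worked example of the accounting below (illustrative numbers only):
  // if the flushed memtables carried 10 MB of raw key/value payload plus
  // 0.5 MB of range tombstone payload, and the resulting table's properties
  // report 7 MB of raw key + raw value bytes, then
  //   *memtable_payload_bytes = 10.5 MB
  //   *memtable_garbage_bytes = 10.5 MB - 7 MB = 3.5 MB
  // i.e. garbage is simply the payload that did not survive into the output.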
+ if (total_payload_bytes_written <= total_payload_bytes) { + *memtable_payload_bytes = total_payload_bytes; + *memtable_garbage_bytes = + total_payload_bytes - total_payload_bytes_written; + } else { + *memtable_payload_bytes = 0; + *memtable_garbage_bytes = 0; + } + } if (table_properties) { *table_properties = tp; } @@ -254,7 +286,7 @@ Status BuildTable( // Finish and check for file errors TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); if (s.ok() && !empty) { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); + StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); *io_status = file_writer->Sync(ioptions.use_fsync); } TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); @@ -273,6 +305,15 @@ Status BuildTable( s = *io_status; } + if (blob_file_builder) { + if (s.ok()) { + s = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(); + } + blob_file_builder.reset(); + } + // TODO Also check the IO status when create the Iterator. if (s.ok() && !empty) { @@ -284,20 +325,20 @@ Status BuildTable( // to cache it here for further user reads ReadOptions read_options; std::unique_ptr it(table_cache->NewIterator( - read_options, file_options, internal_comparator, *meta, + read_options, file_options, tboptions.internal_comparator, *meta, nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor.get(), nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), TableReaderCaller::kFlush, /*arena=*/nullptr, - /*skip_filter=*/false, level, + /*skip_filter=*/false, tboptions.level_at_creation, MaxFileSizeForL0MetaPin(mutable_cf_options), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key*/ nullptr, /*allow_unprepared_value*/ false)); s = it->status(); if (s.ok() && paranoid_file_checks) { - OutputValidator file_validator(internal_comparator, + OutputValidator file_validator(tboptions.internal_comparator, /*enable_order_check=*/true, /*enable_hash=*/true); for (it->SeekToFirst(); it->Valid(); it->Next()) { @@ -318,6 +359,8 @@ Status BuildTable( } if (!s.ok() || meta->fd.GetFileSize() == 0) { + TEST_SYNC_POINT("BuildTable:BeforeDeleteFile"); + constexpr IODebugContext* dbg = nullptr; Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); @@ -327,11 +370,11 @@ Status BuildTable( if (blob_file_additions) { for (const std::string& blob_file_path : blob_file_paths) { - ignored = fs->DeleteFile(blob_file_path, IOOptions(), dbg); + ignored = DeleteDBFile(&db_options, blob_file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); ignored.PermitUncheckedError(); + TEST_SYNC_POINT("BuildTable::AfterDeleteFile"); } - - blob_file_additions->clear(); } } @@ -340,9 +383,9 @@ Status BuildTable( } // Output to event logger and fire events. 
EventHelpers::LogAndNotifyTableFileCreationFinished( - event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s, - file_checksum, file_checksum_func_name); + event_logger, ioptions.listeners, dbname, tboptions.column_family_name, + fname, job_id, meta->fd, meta->oldest_blob_file_number, tp, + tboptions.reason, s, file_checksum, file_checksum_func_name); return s; } diff --git a/db/builder.h b/db/builder.h index 8c80c637955..f8828f5c448 100644 --- a/db/builder.h +++ b/db/builder.h @@ -24,37 +24,20 @@ namespace ROCKSDB_NAMESPACE { -struct Options; struct FileMetaData; class VersionSet; -class Env; -struct EnvOptions; class BlobFileAddition; -class Iterator; class SnapshotChecker; class TableCache; -class VersionEdit; class TableBuilder; class WritableFileWriter; class InternalStats; +class BlobFileCompletionCallback; -// @param column_family_name Name of the column family that is also identified -// by column_family_id, or empty string if unknown. It must outlive the -// TableBuilder returned by this function. -TableBuilder* NewTableBuilder( - const ImmutableCFOptions& options, const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, - WritableFileWriter* file, const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, int level, - const bool skip_filters = false, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0, - const uint64_t file_creation_time = 0, const std::string& db_id = "", - const std::string& db_session_id = ""); +// Convenience function for NewTableBuilder on the embedded table_factory. +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -65,30 +48,26 @@ TableBuilder* NewTableBuilder( // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. 
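// Reading guide for the revised signature below: parameters specific to the
// table being built (compression settings, level, creation and oldest-key
// times, db id/session id, property collector factories) now travel in
// TableBuilderOptions, and process-wide settings come from ImmutableDBOptions.
// The new trailing out-parameters report flush accounting -- the number of
// input entries scanned, memtable payload bytes, and memtable garbage bytes --
// while blob_callback (a BlobFileCompletionCallback) is notified about blob
// files produced during the build.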
extern Status BuildTable( - const std::string& dbname, VersionSet* versions, Env* env, FileSystem* fs, - const ImmutableCFOptions& options, - const MutableCFOptions& mutable_cf_options, const FileOptions& file_options, - TableCache* table_cache, InternalIterator* iter, + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, const CompressionType compression, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, bool paranoid_file_checks, - InternalStats* internal_stats, TableFileCreationReason reason, - IOStatus* io_status, const std::shared_ptr& io_tracer, + SnapshotChecker* snapshot_checker, bool paranoid_file_checks, + InternalStats* internal_stats, IOStatus* io_status, + const std::shared_ptr& io_tracer, EventLogger* event_logger = nullptr, int job_id = 0, const Env::IOPriority io_priority = Env::IO_HIGH, - TableProperties* table_properties = nullptr, int level = -1, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + TableProperties* table_properties = nullptr, Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET, - const uint64_t file_creation_time = 0, const std::string& db_id = "", - const std::string& db_session_id = ""); + const std::string* full_history_ts_low = nullptr, + BlobFileCompletionCallback* blob_callback = nullptr, + uint64_t* num_input_entries = nullptr, + uint64_t* memtable_payload_bytes = nullptr, + uint64_t* memtable_garbage_bytes = nullptr); } // namespace ROCKSDB_NAMESPACE diff --git a/db/c.cc b/db/c.cc index e196d0177e0..79fa5181d85 100644 --- a/db/c.cc +++ b/db/c.cc @@ -11,7 +11,11 @@ #include "rocksdb/c.h" -#include +#include +#include +#include +#include + #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -24,6 +28,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/perf_context.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" @@ -35,17 +40,13 @@ #include "rocksdb/utilities/db_ttl.h" #include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/table_properties_collectors.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_batch.h" -#include "rocksdb/perf_context.h" #include "utilities/merge_operators.h" -#include -#include -#include - using ROCKSDB_NAMESPACE::BackupableDBOptions; using ROCKSDB_NAMESPACE::BackupEngine; using ROCKSDB_NAMESPACE::BackupID; @@ -60,7 +61,6 @@ using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; using ROCKSDB_NAMESPACE::ColumnFamilyHandle; using ROCKSDB_NAMESPACE::ColumnFamilyOptions; using ROCKSDB_NAMESPACE::CompactionFilter; -using ROCKSDB_NAMESPACE::CompactionFilterContext; using ROCKSDB_NAMESPACE::CompactionFilterFactory; using ROCKSDB_NAMESPACE::CompactionOptionsFIFO; using 
ROCKSDB_NAMESPACE::CompactRangeOptions; @@ -80,10 +80,12 @@ using ROCKSDB_NAMESPACE::IngestExternalFileOptions; using ROCKSDB_NAMESPACE::Iterator; using ROCKSDB_NAMESPACE::LiveFileMetaData; using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::LRUCacheOptions; +using ROCKSDB_NAMESPACE::MemoryAllocator; using ROCKSDB_NAMESPACE::MemoryUtil; using ROCKSDB_NAMESPACE::MergeOperator; -using ROCKSDB_NAMESPACE::MergeOperators; using ROCKSDB_NAMESPACE::NewBloomFilterPolicy; +using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory; using ROCKSDB_NAMESPACE::NewGenericRateLimiter; using ROCKSDB_NAMESPACE::NewLRUCache; using ROCKSDB_NAMESPACE::OptimisticTransactionDB; @@ -104,6 +106,7 @@ using ROCKSDB_NAMESPACE::SliceTransform; using ROCKSDB_NAMESPACE::Snapshot; using ROCKSDB_NAMESPACE::SstFileWriter; using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory; using ROCKSDB_NAMESPACE::Transaction; using ROCKSDB_NAMESPACE::TransactionDB; using ROCKSDB_NAMESPACE::TransactionDBOptions; @@ -115,10 +118,8 @@ using ROCKSDB_NAMESPACE::WriteBatch; using ROCKSDB_NAMESPACE::WriteBatchWithIndex; using ROCKSDB_NAMESPACE::WriteOptions; -using std::shared_ptr; using std::vector; using std::unordered_set; -using std::map; extern "C" { @@ -154,6 +155,12 @@ struct rocksdb_filelock_t { FileLock* rep; }; struct rocksdb_logger_t { std::shared_ptr rep; }; +struct rocksdb_lru_cache_options_t { + LRUCacheOptions rep; +}; +struct rocksdb_memory_allocator_t { + std::shared_ptr rep; +}; struct rocksdb_cache_t { std::shared_ptr rep; }; @@ -181,6 +188,9 @@ struct rocksdb_transaction_options_t { struct rocksdb_transaction_t { Transaction* rep; }; +struct rocksdb_backupable_db_options_t { + BackupableDBOptions rep; +}; struct rocksdb_checkpoint_t { Checkpoint* rep; }; @@ -549,6 +559,18 @@ rocksdb_backup_engine_t* rocksdb_backup_engine_open( return result; } +rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts( + const rocksdb_backupable_db_options_t* options, rocksdb_env_t* env, + char** errptr) { + BackupEngine* be; + if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) { + return nullptr; + } + rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr) { @@ -645,6 +667,128 @@ void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) { delete be; } +rocksdb_backupable_db_options_t* rocksdb_backupable_db_options_create( + const char* backup_dir) { + return new rocksdb_backupable_db_options_t{ + BackupableDBOptions(std::string(backup_dir))}; +} + +void rocksdb_backupable_db_options_set_backup_dir( + rocksdb_backupable_db_options_t* options, const char* backup_dir) { + options->rep.backup_dir = std::string(backup_dir); +} + +void rocksdb_backupable_db_options_set_env( + rocksdb_backupable_db_options_t* options, rocksdb_env_t* env) { + options->rep.backup_env = (env ? 
env->rep : nullptr); +} + +void rocksdb_backupable_db_options_set_share_table_files( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.share_table_files = val; +} + +unsigned char rocksdb_backupable_db_options_get_share_table_files( + rocksdb_backupable_db_options_t* options) { + return options->rep.share_table_files; +} + +void rocksdb_backupable_db_options_set_sync( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.sync = val; +} + +unsigned char rocksdb_backupable_db_options_get_sync( + rocksdb_backupable_db_options_t* options) { + return options->rep.sync; +} + +void rocksdb_backupable_db_options_set_destroy_old_data( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.destroy_old_data = val; +} + +unsigned char rocksdb_backupable_db_options_get_destroy_old_data( + rocksdb_backupable_db_options_t* options) { + return options->rep.destroy_old_data; +} + +void rocksdb_backupable_db_options_set_backup_log_files( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.backup_log_files = val; +} + +unsigned char rocksdb_backupable_db_options_get_backup_log_files( + rocksdb_backupable_db_options_t* options) { + return options->rep.backup_log_files; +} + +void rocksdb_backupable_db_options_set_backup_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit) { + options->rep.backup_rate_limit = limit; +} + +uint64_t rocksdb_backupable_db_options_get_backup_rate_limit( + rocksdb_backupable_db_options_t* options) { + return options->rep.backup_rate_limit; +} + +void rocksdb_backupable_db_options_set_restore_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit) { + options->rep.restore_rate_limit = limit; +} + +uint64_t rocksdb_backupable_db_options_get_restore_rate_limit( + rocksdb_backupable_db_options_t* options) { + return options->rep.restore_rate_limit; +} + +void rocksdb_backupable_db_options_set_max_background_operations( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.max_background_operations = val; +} + +int rocksdb_backupable_db_options_get_max_background_operations( + rocksdb_backupable_db_options_t* options) { + return options->rep.max_background_operations; +} + +void rocksdb_backupable_db_options_set_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options, uint64_t size) { + options->rep.callback_trigger_interval_size = size; +} + +uint64_t rocksdb_backupable_db_options_get_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options) { + return options->rep.callback_trigger_interval_size; +} + +void rocksdb_backupable_db_options_set_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.max_valid_backups_to_open = val; +} + +int rocksdb_backupable_db_options_get_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options) { + return options->rep.max_valid_backups_to_open; +} + +void rocksdb_backupable_db_options_set_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.share_files_with_checksum_naming = + static_cast(val); +} + +int rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options) { + return static_cast(options->rep.share_files_with_checksum_naming); +} + +void rocksdb_backupable_db_options_destroy( + rocksdb_backupable_db_options_t* options) { + delete options; +} + rocksdb_checkpoint_t* 
rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr) { Checkpoint* checkpoint; @@ -1255,34 +1399,39 @@ char* rocksdb_property_value_cf( } } -void rocksdb_approximate_sizes( - rocksdb_t* db, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { +void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges, + const char* const* range_start_key, + const size_t* range_start_key_len, + const char* const* range_limit_key, + const size_t* range_limit_key_len, + uint64_t* sizes, char** errptr) { Range* ranges = new Range[num_ranges]; for (int i = 0; i < num_ranges; i++) { ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); } - db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } delete[] ranges; } void rocksdb_approximate_sizes_cf( - rocksdb_t* db, - rocksdb_column_family_handle_t* column_family, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) { Range* ranges = new Range[num_ranges]; for (int i = 0; i < num_ranges; i++) { ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); } - db->rep->GetApproximateSizes(column_family->rep, ranges, num_ranges, sizes); + Status s = db->rep->GetApproximateSizes(column_family->rep, ranges, + num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } delete[] ranges; } @@ -1363,6 +1512,10 @@ void rocksdb_flush_cf( SaveError(errptr, db->rep->Flush(options->rep, column_family->rep)); } +void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) { + SaveError(errptr, db->rep->FlushWAL(sync)); +} + void rocksdb_disable_file_deletions( rocksdb_t* db, char** errptr) { @@ -2544,6 +2697,59 @@ unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( return opt->rep.skip_checking_sst_file_sizes_on_db_open; } +/* Blob Options Settings */ +void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_files = val; +} +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt) { + return opt->rep.enable_blob_files; +} + +void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.min_blob_size = val; +} + +uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) { + return opt->rep.min_blob_size; +} + +void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.blob_file_size = val; +} + +uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) { + return opt->rep.blob_file_size; +} + +void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt, + int val) { + opt->rep.blob_compression_type = static_cast(val); +} + +int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) { + return opt->rep.blob_compression_type; +} + +void 
rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_garbage_collection = val; +} + +unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) { + return opt->rep.enable_blob_garbage_collection; +} + +void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_age_cutoff = val; +} + +double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_age_cutoff; +} + void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } @@ -2636,6 +2842,14 @@ void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( opt->rep.bottommost_compression_opts.enabled = enabled; } +void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes, + unsigned char enabled) { + opt->rep.bottommost_compression_opts.max_dict_buffer_bytes = + max_dict_buffer_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits, int level, int strategy, int max_dict_bytes) { @@ -2650,6 +2864,31 @@ void rocksdb_options_set_compression_options_zstd_max_train_bytes( opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes; } +int rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.zstd_max_train_bytes; +} + +void rocksdb_options_set_compression_options_parallel_threads( + rocksdb_options_t* opt, int value) { + opt->rep.compression_opts.parallel_threads = value; +} + +int rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.parallel_threads; +} + +void rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes) { + opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes; +} + +uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.max_dict_buffer_bytes; +} + void rocksdb_options_set_prefix_extractor( rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) { opt->rep.prefix_extractor.reset(prefix_extractor); @@ -2808,6 +3047,8 @@ void rocksdb_options_set_access_hint_on_compaction_start( opt->rep.access_hint_on_compaction_start = ROCKSDB_NAMESPACE::Options::WILLNEED; break; + default: + assert(0); } } @@ -3262,6 +3503,15 @@ unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) { return opt->rep.atomic_flush; } +void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt, + unsigned char manual_wal_flush) { + opt->rep.manual_wal_flush = manual_wal_flush; +} + +unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) { + return opt->rep.manual_wal_flush; +} + rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, @@ -3283,6 +3533,14 @@ void rocksdb_options_set_row_cache(rocksdb_options_t* opt, rocksdb_cache_t* cach } } +void rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) { + std::shared_ptr + compact_on_del = + NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger); + opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); +} + void 
rocksdb_set_perf_level(int v) { PerfLevel level = static_cast(v); SetPerfLevel(level); @@ -3798,6 +4056,25 @@ unsigned char rocksdb_readoptions_get_ignore_range_deletions( return opt->rep.ignore_range_deletions; } +void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.deadline = std::chrono::microseconds(microseconds); +} + +uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) { + return opt->rep.deadline.count(); +} + +void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.io_timeout = std::chrono::microseconds(microseconds); +} + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) { + return opt->rep.io_timeout.count(); +} + rocksdb_writeoptions_t* rocksdb_writeoptions_create() { return new rocksdb_writeoptions_t; } @@ -3930,16 +4207,58 @@ unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) { return opt->rep.wait; } +rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create( + char** errptr) { + rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t; + ROCKSDB_NAMESPACE::JemallocAllocatorOptions options; + SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator( + options, &allocator->rep)); + return allocator; +} + +void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) { + delete allocator; +} + +rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() { + return new rocksdb_lru_cache_options_t; +} + +void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) { + delete opt; +} + +void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt, + size_t capacity) { + opt->rep.capacity = capacity; +} + +void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) { + opt->rep.memory_allocator = allocator->rep; +} + rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { rocksdb_cache_t* c = new rocksdb_cache_t; c->rep = NewLRUCache(capacity); return c; } +rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t* opt) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(opt->rep); + return c; +} + void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } +void rocksdb_cache_disown_data(rocksdb_cache_t* cache) { + cache->rep->DisownData(); +} + void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { cache->rep->SetCapacity(capacity); } @@ -3985,20 +4304,36 @@ void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n); } +int rocksdb_env_get_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(); +} + void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n, Env::BOTTOM); } +int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::BOTTOM); +} + void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n, Env::HIGH); } +int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::HIGH); +} + void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n, Env::LOW); } +int 
rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::LOW); +} + void rocksdb_env_join_all_threads(rocksdb_env_t* env) { env->rep->WaitForJoin(); } @@ -4226,32 +4561,62 @@ void rocksdb_universal_compaction_options_set_size_ratio( uco->rep->size_ratio = ratio; } +int rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->size_ratio; +} + void rocksdb_universal_compaction_options_set_min_merge_width( rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->min_merge_width = w; } +int rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->min_merge_width; +} + void rocksdb_universal_compaction_options_set_max_merge_width( rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->max_merge_width = w; } +int rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_merge_width; +} + void rocksdb_universal_compaction_options_set_max_size_amplification_percent( rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->max_size_amplification_percent = p; } +int rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_size_amplification_percent; +} + void rocksdb_universal_compaction_options_set_compression_size_percent( rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->compression_size_percent = p; } +int rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->compression_size_percent; +} + void rocksdb_universal_compaction_options_set_stop_style( rocksdb_universal_compaction_options_t* uco, int style) { uco->rep->stop_style = static_cast(style); } +int rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t* uco) { + return static_cast(uco->rep->stop_style); +} + void rocksdb_universal_compaction_options_destroy( rocksdb_universal_compaction_options_t* uco) { delete uco->rep; @@ -4269,6 +4634,11 @@ void rocksdb_fifo_compaction_options_set_max_table_files_size( fifo_opts->rep.max_table_files_size = size; } +uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts) { + return fifo_opts->rep.max_table_files_size; +} + void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts) { delete fifo_opts; @@ -4567,7 +4937,10 @@ void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) { const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot( rocksdb_transaction_t* txn) { - rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + // This will be freed later on using free, so use malloc here to avoid a + // mismatch + rocksdb_snapshot_t* result = + (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t)); result->rep = txn->rep->GetSnapshot(); return result; } diff --git a/db/c_test.c b/db/c_test.c index a01336738b3..5b7459b069f 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -517,6 +517,9 @@ int main(int argc, char** argv) { coptions = rocksdb_compactoptions_create(); rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1); + rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000, + 10001); + StartPhase("destroy"); rocksdb_destroy_db(options, dbname, &err); Free(&err); @@ -988,7 +991,9 @@ int 
main(int argc, char** argv) { &err); CheckNoError(err); } - rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes, + &err); + CheckNoError(err); CheckCondition(sizes[0] > 0); CheckCondition(sizes[1] > 0); } @@ -1117,9 +1122,8 @@ int main(int argc, char** argv) { // Essentially a fingerprint of the block-based Bloom schema CheckCondition(hits == 241); } else { - // Essentially a fingerprint of the full Bloom schema(s), - // format_version < 5, which vary for three different CACHE_LINE_SIZEs - CheckCondition(hits == 224 || hits == 180 || hits == 125); + // Essentially a fingerprint of full Bloom schema, format_version=5 + CheckCondition(hits == 188); } CheckCondition( (keys_to_query - hits) == @@ -1275,6 +1279,9 @@ int main(int argc, char** argv) { CheckPinGetCF(db, roptions, handles[1], "box", "c"); rocksdb_writebatch_destroy(wb); + rocksdb_flush_wal(db, 1, &err); + CheckNoError(err); + const char* keys[3] = { "box", "box", "barfooxx" }; const rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] }; const size_t keys_sizes[3] = { 3, 3, 8 }; @@ -1758,6 +1765,28 @@ int main(int argc, char** argv) { rocksdb_options_set_atomic_flush(o, 1); CheckCondition(1 == rocksdb_options_get_atomic_flush(o)); + rocksdb_options_set_manual_wal_flush(o, 1); + CheckCondition(1 == rocksdb_options_get_manual_wal_flush(o)); + + /* Blob Options */ + rocksdb_options_set_enable_blob_files(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_files(o)); + + rocksdb_options_set_min_blob_size(o, 29); + CheckCondition(29 == rocksdb_options_get_min_blob_size(o)); + + rocksdb_options_set_blob_file_size(o, 30); + CheckCondition(30 == rocksdb_options_get_blob_file_size(o)); + + rocksdb_options_set_blob_compression_type(o, 4); + CheckCondition(4 == rocksdb_options_get_blob_compression_type(o)); + + rocksdb_options_set_enable_blob_gc(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o)); + + rocksdb_options_set_blob_gc_age_cutoff(o, 0.75); + CheckCondition(0.75 == rocksdb_options_get_blob_gc_age_cutoff(o)); + // Create a copy that should be equal to the original. 
rocksdb_options_t* copy; copy = rocksdb_options_create_copy(o); @@ -2279,6 +2308,12 @@ int main(int argc, char** argv) { rocksdb_readoptions_set_ignore_range_deletions(ro, 1); CheckCondition(1 == rocksdb_readoptions_get_ignore_range_deletions(ro)); + rocksdb_readoptions_set_deadline(ro, 300); + CheckCondition(300 == rocksdb_readoptions_get_deadline(ro)); + + rocksdb_readoptions_set_io_timeout(ro, 400); + CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro)); + rocksdb_readoptions_destroy(ro); } @@ -2355,6 +2390,177 @@ int main(int argc, char** argv) { rocksdb_cache_destroy(co); } + StartPhase("jemalloc_nodump_allocator"); + { + rocksdb_memory_allocator_t* allocator; + allocator = rocksdb_jemalloc_nodump_allocator_create(&err); + if (err != NULL) { + // not supported on all platforms, allow unsupported error + const char* ni = "Not implemented: "; + size_t ni_len = strlen(ni); + size_t err_len = strlen(err); + + CheckCondition(err_len >= ni_len); + CheckCondition(memcmp(ni, err, ni_len) == 0); + Free(&err); + } else { + rocksdb_cache_t* co; + rocksdb_lru_cache_options_t* copts; + + copts = rocksdb_lru_cache_options_create(); + + rocksdb_lru_cache_options_set_capacity(copts, 100); + rocksdb_lru_cache_options_set_memory_allocator(copts, allocator); + + co = rocksdb_cache_create_lru_opts(copts); + CheckCondition(100 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_destroy(co); + rocksdb_lru_cache_options_destroy(copts); + } + rocksdb_memory_allocator_destroy(allocator); + } + + StartPhase("env"); + { + rocksdb_env_t* e; + e = rocksdb_create_default_env(); + + rocksdb_env_set_background_threads(e, 10); + CheckCondition(10 == rocksdb_env_get_background_threads(e)); + + rocksdb_env_set_high_priority_background_threads(e, 20); + CheckCondition(20 == rocksdb_env_get_high_priority_background_threads(e)); + + rocksdb_env_set_low_priority_background_threads(e, 30); + CheckCondition(30 == rocksdb_env_get_low_priority_background_threads(e)); + + rocksdb_env_set_bottom_priority_background_threads(e, 40); + CheckCondition(40 == rocksdb_env_get_bottom_priority_background_threads(e)); + + rocksdb_env_destroy(e); + } + + StartPhase("universal_compaction_options"); + { + rocksdb_universal_compaction_options_t* uco; + uco = rocksdb_universal_compaction_options_create(); + + rocksdb_universal_compaction_options_set_size_ratio(uco, 5); + CheckCondition(5 == + rocksdb_universal_compaction_options_get_size_ratio(uco)); + + rocksdb_universal_compaction_options_set_min_merge_width(uco, 15); + CheckCondition( + 15 == rocksdb_universal_compaction_options_get_min_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_merge_width(uco, 25); + CheckCondition( + 25 == rocksdb_universal_compaction_options_get_max_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco, + 35); + CheckCondition( + 35 == + rocksdb_universal_compaction_options_get_max_size_amplification_percent( + uco)); + + rocksdb_universal_compaction_options_set_compression_size_percent(uco, 45); + CheckCondition( + 45 == + rocksdb_universal_compaction_options_get_compression_size_percent(uco)); + + rocksdb_universal_compaction_options_set_stop_style(uco, 1); + CheckCondition(1 == + rocksdb_universal_compaction_options_get_stop_style(uco)); + + rocksdb_universal_compaction_options_destroy(uco); + } + + StartPhase("fifo_compaction_options"); + { + rocksdb_fifo_compaction_options_t* fco; + fco = rocksdb_fifo_compaction_options_create(); + + 
rocksdb_fifo_compaction_options_set_max_table_files_size(fco, 100000); + CheckCondition( + 100000 == + rocksdb_fifo_compaction_options_get_max_table_files_size(fco)); + + rocksdb_fifo_compaction_options_destroy(fco); + } + + StartPhase("backupable_db_option"); + { + rocksdb_backupable_db_options_t* bdo; + bdo = rocksdb_backupable_db_options_create("path"); + + rocksdb_backupable_db_options_set_share_table_files(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_share_table_files(bdo)); + + rocksdb_backupable_db_options_set_sync(bdo, 1); + CheckCondition(1 == rocksdb_backupable_db_options_get_sync(bdo)); + + rocksdb_backupable_db_options_set_destroy_old_data(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_destroy_old_data(bdo)); + + rocksdb_backupable_db_options_set_backup_log_files(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_backup_log_files(bdo)); + + rocksdb_backupable_db_options_set_backup_rate_limit(bdo, 123); + CheckCondition(123 == + rocksdb_backupable_db_options_get_backup_rate_limit(bdo)); + + rocksdb_backupable_db_options_set_restore_rate_limit(bdo, 37); + CheckCondition(37 == + rocksdb_backupable_db_options_get_restore_rate_limit(bdo)); + + rocksdb_backupable_db_options_set_max_background_operations(bdo, 20); + CheckCondition( + 20 == rocksdb_backupable_db_options_get_max_background_operations(bdo)); + + rocksdb_backupable_db_options_set_callback_trigger_interval_size(bdo, 9000); + CheckCondition( + 9000 == + rocksdb_backupable_db_options_get_callback_trigger_interval_size(bdo)); + + rocksdb_backupable_db_options_set_max_valid_backups_to_open(bdo, 40); + CheckCondition( + 40 == rocksdb_backupable_db_options_get_max_valid_backups_to_open(bdo)); + + rocksdb_backupable_db_options_set_share_files_with_checksum_naming(bdo, 2); + CheckCondition( + 2 == rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + bdo)); + + rocksdb_backupable_db_options_destroy(bdo); + } + + StartPhase("compression_options"); + { + rocksdb_options_t* co; + co = rocksdb_options_create(); + + rocksdb_options_set_compression_options_zstd_max_train_bytes(co, 100); + CheckCondition( + 100 == + rocksdb_options_get_compression_options_zstd_max_train_bytes(co)); + + rocksdb_options_set_compression_options_parallel_threads(co, 2); + CheckCondition( + 2 == rocksdb_options_get_compression_options_parallel_threads(co)); + + rocksdb_options_set_compression_options_max_dict_buffer_bytes(co, 200); + CheckCondition( + 200 == + rocksdb_options_get_compression_options_max_dict_buffer_bytes(co)); + + rocksdb_options_destroy(co); + } + StartPhase("iterate_upper_bound"); { // Create new empty database @@ -2755,7 +2961,7 @@ int main(int argc, char** argv) { #else -int main() { +int main(void) { fprintf(stderr, "SKIPPED\n"); return 0; } diff --git a/db/column_family.cc b/db/column_family.cc index d0a16dd48c4..c168f2b1d12 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -16,6 +16,7 @@ #include #include +#include "db/blob/blob_file_cache.h" #include "db/compaction/compaction_picker.h" #include "db/compaction/compaction_picker_fifo.h" #include "db/compaction/compaction_picker_level.h" @@ -32,6 +33,7 @@ #include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "port/port.h" +#include "rocksdb/convenience.h" #include "rocksdb/table.h" #include "table/merging_iterator.h" #include "util/autovector.h" @@ -107,8 +109,9 @@ const Comparator* ColumnFamilyHandleImpl::GetComparator() const { void GetIntTblPropCollectorFactory( 
const ImmutableCFOptions& ioptions, - std::vector>* - int_tbl_prop_collector_factories) { + IntTblPropCollectorFactories* int_tbl_prop_collector_factories) { + assert(int_tbl_prop_collector_factories); + auto& collector_factories = ioptions.table_properties_collector_factories; for (size_t i = 0; i < ioptions.table_properties_collector_factories.size(); ++i) { @@ -212,7 +215,8 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, // if user sets arena_block_size, we trust user to use this value. Otherwise, // calculate a proper value from writer_buffer_size; if (result.arena_block_size <= 0) { - result.arena_block_size = result.write_buffer_size / 8; + result.arena_block_size = + std::min(size_t{1024 * 1024}, result.write_buffer_size / 8); // Align up to 4k const size_t align = 4 * 1024; @@ -281,7 +285,7 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, } if (result.level0_file_num_compaction_trigger == 0) { - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "level0_file_num_compaction_trigger cannot be 0"); result.level0_file_num_compaction_trigger = 1; } @@ -290,7 +294,7 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.level0_slowdown_writes_trigger || result.level0_slowdown_writes_trigger < result.level0_file_num_compaction_trigger) { - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "This condition must be satisfied: " "level0_stop_writes_trigger(%d) >= " "level0_slowdown_writes_trigger(%d) >= " @@ -307,7 +311,7 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.level0_slowdown_writes_trigger) { result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger; } - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "Adjust the value to " "level0_stop_writes_trigger(%d)" "level0_slowdown_writes_trigger(%d)" @@ -334,7 +338,9 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, // was not used) auto sfm = static_cast(db_options.sst_file_manager.get()); for (size_t i = 0; i < result.cf_paths.size(); i++) { - DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path); + DeleteScheduler::CleanupDirectory(db_options.env, sfm, + result.cf_paths[i].path) + .PermitUncheckedError(); } #endif @@ -448,9 +454,7 @@ void SuperVersion::Cleanup() { to_delete.push_back(m); } current->Unref(); - if (cfd->Unref()) { - delete cfd; - } + cfd->UnrefAndTryDelete(this); } void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, @@ -500,7 +504,8 @@ ColumnFamilyData::ColumnFamilyData( const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, const FileOptions& file_options, ColumnFamilySet* column_family_set, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -543,7 +548,7 @@ ColumnFamilyData::ColumnFamilyData( db_paths_registered_ = true; } else { ROCKS_LOG_ERROR( - ioptions_.info_log, + ioptions_.logger, "Failed to register data paths of column family (id: %d, name: %s)", id_, name_.c_str()); } @@ -556,9 +561,14 @@ ColumnFamilyData::ColumnFamilyData( // if _dummy_versions is nullptr, then this is a dummy column family. 
if (_dummy_versions != nullptr) { internal_stats_.reset( - new InternalStats(ioptions_.num_levels, db_options.env, this)); + new InternalStats(ioptions_.num_levels, ioptions_.clock, this)); table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache, - block_cache_tracer, io_tracer)); + block_cache_tracer, io_tracer, + db_session_id)); + blob_file_cache_.reset( + new BlobFileCache(_table_cache, ioptions(), soptions(), id_, + internal_stats_->GetBlobFileReadHist(), io_tracer)); + if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); @@ -572,13 +582,13 @@ ColumnFamilyData::ColumnFamilyData( } else if (ioptions_.compaction_style == kCompactionStyleNone) { compaction_picker_.reset(new NullCompactionPicker( ioptions_, &internal_comparator_)); - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "Column family %s does not use any background compaction. " "Compactions can only be done via CompactFiles\n", GetName().c_str()); #endif // !ROCKSDB_LITE } else { - ROCKS_LOG_ERROR(ioptions_.info_log, + ROCKS_LOG_ERROR(ioptions_.logger, "Unable to recognize the specified compaction style %d. " "Column family %s will use kCompactionStyleLevel.\n", ioptions_.compaction_style, GetName().c_str()); @@ -587,12 +597,12 @@ ColumnFamilyData::ColumnFamilyData( } if (column_family_set_->NumberOfColumnFamilies() < 10) { - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "--------------- Options for column family [%s]:\n", name.c_str()); - initial_cf_options_.Dump(ioptions_.info_log); + initial_cf_options_.Dump(ioptions_.logger); } else { - ROCKS_LOG_INFO(ioptions_.info_log, "\t(skipping printing options)\n"); + ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n"); } } @@ -627,7 +637,7 @@ ColumnFamilyData::~ColumnFamilyData() { if (dummy_versions_ != nullptr) { // List must be empty - assert(dummy_versions_->TEST_Next() == dummy_versions_); + assert(dummy_versions_->Next() == dummy_versions_); bool deleted __attribute__((__unused__)); deleted = dummy_versions_->Unref(); assert(deleted); @@ -648,14 +658,14 @@ ColumnFamilyData::~ColumnFamilyData() { Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths()); if (!s.ok()) { ROCKS_LOG_ERROR( - ioptions_.info_log, + ioptions_.logger, "Failed to unregister data paths of column family (id: %d, name: %s)", id_, name_.c_str()); } } } -bool ColumnFamilyData::UnrefAndTryDelete() { +bool ColumnFamilyData::UnrefAndTryDelete(SuperVersion* sv_under_cleanup) { int old_refs = refs_.fetch_sub(1); assert(old_refs > 0); @@ -665,7 +675,11 @@ bool ColumnFamilyData::UnrefAndTryDelete() { return true; } - if (old_refs == 2 && super_version_ != nullptr) { + // If called under SuperVersion::Cleanup, we should not re-enter Cleanup on + // the same SuperVersion. (But while installing a new SuperVersion, this + // cfd could be referenced only by two SuperVersions.) 
+ if (old_refs == 2 && super_version_ != nullptr && + super_version_ != sv_under_cleanup) { // Only the super_version_ holds me SuperVersion* sv = super_version_; super_version_ = nullptr; @@ -703,9 +717,7 @@ uint64_t ColumnFamilyData::OldestLogToKeep() { auto current_log = GetLogNumber(); if (allow_2pc_) { - autovector empty_list; - auto imm_prep_log = - imm()->PrecomputeMinLogContainingPrepSection(empty_list); + auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection(); auto mem_prep_log = mem()->GetMinLogContainingPrepSection(); if (imm_prep_log > 0 && imm_prep_log < current_log) { @@ -827,7 +839,8 @@ std::pair ColumnFamilyData::GetWriteStallConditionAndCause( int num_unflushed_memtables, int num_l0_files, uint64_t num_compaction_needed_bytes, - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& immutable_cf_options) { if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) { return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && @@ -841,7 +854,9 @@ ColumnFamilyData::GetWriteStallConditionAndCause( WriteStallCause::kPendingCompactionBytes}; } else if (mutable_cf_options.max_write_buffer_number > 3 && num_unflushed_memtables >= - mutable_cf_options.max_write_buffer_number - 1) { + mutable_cf_options.max_write_buffer_number - 1 && + num_unflushed_memtables - 1 >= + immutable_cf_options.min_write_buffer_number_to_merge) { return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && mutable_cf_options.level0_slowdown_writes_trigger >= 0 && @@ -869,7 +884,8 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( auto write_stall_condition_and_cause = GetWriteStallConditionAndCause( imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(), - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options); + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options, + *ioptions()); write_stall_condition = write_stall_condition_and_cause.first; auto write_stall_cause = write_stall_condition_and_cause.second; @@ -881,7 +897,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->NumNotFlushed(), @@ -894,7 +910,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats( InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1); } - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), vstorage->l0_delay_trigger_count()); } else if (write_stall_condition == WriteStallCondition::kStopped && @@ -903,7 +919,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stopping writes because of estimated pending compaction " "bytes %" PRIu64, name_.c_str(), compaction_needed_bytes); @@ -915,7 +931,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( 
mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stalling writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d " "rate %" PRIu64, @@ -937,7 +953,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats( InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1); } - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stalling writes because we have %d level-0 files " "rate %" PRIu64, name_.c_str(), vstorage->l0_delay_trigger_count(), @@ -962,7 +978,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stalling writes because of estimated pending compaction " "bytes %" PRIu64 " rate %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes(), @@ -976,7 +992,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( write_controller_token_ = write_controller->GetCompactionPressureToken(); ROCKS_LOG_INFO( - ioptions_.info_log, + ioptions_.logger, "[%s] Increasing compaction threads because we have %d level-0 " "files ", name_.c_str(), vstorage->l0_delay_trigger_count()); @@ -990,7 +1006,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( write_controller->GetCompactionPressureToken(); if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { ROCKS_LOG_INFO( - ioptions_.info_log, + ioptions_.logger, "[%s] Increasing compaction threads because of estimated pending " "compaction " "bytes %" PRIu64, @@ -1083,7 +1099,7 @@ bool ColumnFamilyData::RangeOverlapWithCompaction( Status ColumnFamilyData::RangesOverlapWithMemtables( const autovector& ranges, SuperVersion* super_version, - bool* overlap) { + bool allow_data_in_errors, bool* overlap) { assert(overlap != nullptr); *overlap = false; // Create an InternalIterator over all unflushed memtables @@ -1116,13 +1132,12 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( memtable_iter->Seek(range_start.Encode()); status = memtable_iter->status(); ParsedInternalKey seek_result; - if (status.ok()) { - if (memtable_iter->Valid() && - ParseInternalKey(memtable_iter->key(), &seek_result) != - Status::OK()) { - status = Status::Corruption("DB have corrupted keys"); - } + + if (status.ok() && memtable_iter->Valid()) { + status = ParseInternalKey(memtable_iter->key(), &seek_result, + allow_data_in_errors); } + if (status.ok()) { if (memtable_iter->Valid() && ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { @@ -1192,11 +1207,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) { SuperVersion* sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); + RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; if (sv && sv->Unref()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS); + RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS); db->mutex()->Lock(); // NOTE: underlying resources held by superversion (sst files) might // not be released until the next background job. 
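A minimal usage sketch (not part of the patch) of the blob settings that the ValidateOptions() hunk below range-checks; the field names are the actual ColumnFamilyOptions members touched elsewhere in this patch, the values are arbitrary:

  #include "rocksdb/options.h"

  rocksdb::ColumnFamilyOptions MakeBlobCFOptions() {
    rocksdb::ColumnFamilyOptions cf_options;
    cf_options.enable_blob_files = true;                   // write large values to blob files
    cf_options.min_blob_size = 1024;                        // values >= 1 KiB are separated out
    cf_options.enable_blob_garbage_collection = true;
    cf_options.blob_garbage_collection_age_cutoff = 0.25;   // rejected below unless in [0.0, 1.0]
    return cf_options;
  }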
@@ -1335,24 +1350,39 @@ Status ColumnFamilyData::ValidateOptions( "Block-Based Table format. "); } } + + if (cf_options.enable_blob_garbage_collection && + (cf_options.blob_garbage_collection_age_cutoff < 0.0 || + cf_options.blob_garbage_collection_age_cutoff > 1.0)) { + return Status::InvalidArgument( + "The age cutoff for blob garbage collection should be in the range " + "[0.0, 1.0]."); + } + + if (cf_options.compaction_style == kCompactionStyleFIFO && + db_options.max_open_files != -1 && cf_options.ttl > 0) { + return Status::NotSupported( + "FIFO compaction only supported with max_open_files = -1."); + } + return s; } #ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( - const DBOptions& db_options, + const DBOptions& db_opts, const std::unordered_map& options_map) { - MutableCFOptions new_mutable_cf_options; - Status s = - GetMutableOptionsFromStrings(mutable_cf_options_, options_map, - ioptions_.info_log, &new_mutable_cf_options); + ColumnFamilyOptions cf_opts = + BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); + ConfigOptions config_opts; + config_opts.mutable_options_only = true; + Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map, + &cf_opts); if (s.ok()) { - ColumnFamilyOptions cf_options = - BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); - s = ValidateOptions(db_options, cf_options); + s = ValidateOptions(db_opts, cf_opts); } if (s.ok()) { - mutable_cf_options_ = new_mutable_cf_options; + mutable_cf_options_ = MutableCFOptions(cf_opts); mutable_cf_options_.RefreshDerivedOptions(ioptions_); } return s; @@ -1391,7 +1421,8 @@ Status ColumnFamilyData::AddDirectories( if (existing_dir == created_dirs->end()) { std::unique_ptr path_directory; - s = DBImpl::CreateAndNewDirectory(ioptions_.fs, p.path, &path_directory); + s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path, + &path_directory); if (!s.ok()) { return s; } @@ -1422,12 +1453,13 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, WriteBufferManager* _write_buffer_manager, WriteController* _write_controller, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : max_column_family_(0), dummy_cfd_(new ColumnFamilyData( ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, file_options, nullptr, - block_cache_tracer, io_tracer)), + block_cache_tracer, io_tracer, db_session_id)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), @@ -1436,7 +1468,8 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, write_buffer_manager_(_write_buffer_manager), write_controller_(_write_controller), block_cache_tracer_(block_cache_tracer), - io_tracer_(io_tracer) { + io_tracer_(io_tracer), + db_session_id_(db_session_id) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -1502,7 +1535,8 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, - *db_options_, file_options_, this, block_cache_tracer_, io_tracer_); + *db_options_, file_options_, this, block_cache_tracer_, io_tracer_, + db_session_id_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); diff --git 
a/db/column_family.h b/db/column_family.h index 0a251e54537..7ad560e44d9 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -44,6 +44,7 @@ class LogBuffer; class InstrumentedMutex; class InstrumentedMutexLock; struct SuperVersionContext; +class BlobFileCache; extern const double kIncSlowdownRatio; // This file contains a list of data structures for managing column family @@ -252,13 +253,12 @@ extern Status CheckCFPathsSupported(const DBOptions& db_options, extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src); -// Wrap user defined table proproties collector factories `from cf_options` +// Wrap user defined table properties collector factories `from cf_options` // into internal ones in int_tbl_prop_collector_factories. Add a system internal // one too. extern void GetIntTblPropCollectorFactory( const ImmutableCFOptions& ioptions, - std::vector>* - int_tbl_prop_collector_factories); + IntTblPropCollectorFactories* int_tbl_prop_collector_factories); class ColumnFamilySet; @@ -278,21 +278,11 @@ class ColumnFamilyData { // holding a DB mutex, or as the leader in a write batch group). void Ref() { refs_.fetch_add(1); } - // Unref decreases the reference count, but does not handle deletion - // when the count goes to 0. If this method returns true then the - // caller should delete the instance immediately, or later, by calling - // FreeDeadColumnFamilies(). Unref() can only be called while holding - // a DB mutex, or during single-threaded recovery. - bool Unref() { - int old_refs = refs_.fetch_sub(1); - assert(old_refs > 0); - return old_refs == 1; - } - // UnrefAndTryDelete() decreases the reference count and do free if needed, // return true if this is freed else false, UnrefAndTryDelete() can only // be called while holding a DB mutex, or during single-threaded recovery. - bool UnrefAndTryDelete(); + // sv_under_cleanup is only provided when called from SuperVersion::Cleanup. + bool UnrefAndTryDelete(SuperVersion* sv_under_cleanup = nullptr); // SetDropped() can only be called under following conditions: // 1) Holding a DB mutex, @@ -325,7 +315,7 @@ class ColumnFamilyData { FlushReason GetFlushReason() const { return flush_reason_; } // thread-safe const FileOptions* soptions() const; - const ImmutableCFOptions* ioptions() const { return &ioptions_; } + const ImmutableOptions* ioptions() const { return &ioptions_; } // REQUIRES: DB mutex held // This returns the MutableCFOptions used by current SuperVersion // You should use this API to reference MutableCFOptions most of the time. 
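The hunk above removes ColumnFamilyData::Unref(), which left deletion to the caller, in favor of UnrefAndTryDelete(), which frees the object itself once the reference count reaches zero. A sketch of the caller-side change, not part of the patch; the same pattern appears in SuperVersion::Cleanup() in db/column_family.cc earlier in this patch:

  // old pattern, removed by this patch: the caller owns the delete
  if (cfd->Unref()) {
    delete cfd;
  }

  // new pattern: deletion is handled internally; the optional SuperVersion*
  // argument lets Cleanup() avoid re-entering itself on the SuperVersion
  // currently being torn down
  cfd->UnrefAndTryDelete();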
@@ -359,6 +349,11 @@ class ColumnFamilyData { MemTableList* imm() { return &imm_; } MemTable* mem() { return mem_; } + + bool IsEmpty() { + return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0; + } + Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } void SetCurrent(Version* _current); @@ -381,6 +376,7 @@ class ColumnFamilyData { SequenceNumber earliest_seq); TableCache* table_cache() const { return table_cache_.get(); } + BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); } // See documentation in compaction_picker.h // REQUIRES: DB mutex held @@ -404,7 +400,8 @@ class ColumnFamilyData { // // Thread-safe Status RangesOverlapWithMemtables(const autovector& ranges, - SuperVersion* super_version, bool* overlap); + SuperVersion* super_version, + bool allow_data_in_errors, bool* overlap); // A flag to tell a manual compaction is to compact all levels together // instead of a specific level. @@ -430,8 +427,7 @@ class ColumnFamilyData { return internal_comparator_; } - const std::vector>* - int_tbl_prop_collector_factories() const { + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const { return &int_tbl_prop_collector_factories_; } @@ -443,7 +439,7 @@ class ColumnFamilyData { // Get SuperVersion stored in thread local storage. If it does not exist, // get a reference from a current SuperVersion. SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); - // Try to return SuperVersion back to thread local storage. Retrun true on + // Try to return SuperVersion back to thread local storage. Return true on // success and false on failure. It fails when the thread local storage // contains anything other than SuperVersion::kSVInUse flag. bool ReturnThreadLocalSuperVersion(SuperVersion* sv); @@ -477,9 +473,11 @@ class ColumnFamilyData { kPendingCompactionBytes, }; static std::pair - GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files, - uint64_t num_compaction_needed_bytes, - const MutableCFOptions& mutable_cf_options); + GetWriteStallConditionAndCause( + int num_unflushed_memtables, int num_l0_files, + uint64_t num_compaction_needed_bytes, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& immutable_cf_options); // Recalculate some small conditions, which are changed only during // compaction, adding new memtable and/or @@ -506,6 +504,21 @@ class ColumnFamilyData { FSDirectory* GetDataDir(size_t path_id) const; + // full_history_ts_low_ can only increase. 
+ void SetFullHistoryTsLow(std::string ts_low) { + assert(!ts_low.empty()); + const Comparator* ucmp = user_comparator(); + assert(ucmp); + if (full_history_ts_low_.empty() || + ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) { + full_history_ts_low_ = std::move(ts_low); + } + } + + const std::string& GetFullHistoryTsLow() const { + return full_history_ts_low_; + } + ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } private: @@ -519,7 +532,8 @@ class ColumnFamilyData { const FileOptions& file_options, ColumnFamilySet* column_family_set, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + const std::string& db_session_id); std::vector GetDbPaths() const; @@ -533,16 +547,16 @@ class ColumnFamilyData { std::atomic dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; - std::vector> - int_tbl_prop_collector_factories_; + IntTblPropCollectorFactories int_tbl_prop_collector_factories_; const ColumnFamilyOptions initial_cf_options_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; const bool is_delete_range_supported_; std::unique_ptr table_cache_; + std::unique_ptr blob_file_cache_; std::unique_ptr internal_stats_; @@ -601,6 +615,8 @@ class ColumnFamilyData { std::vector> data_dirs_; bool db_paths_registered_; + + std::string full_history_ts_low_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -653,7 +669,8 @@ class ColumnFamilySet { WriteBufferManager* _write_buffer_manager, WriteController* _write_controller, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + const std::string& db_session_id); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -718,6 +735,7 @@ class ColumnFamilySet { WriteController* write_controller_; BlockCacheTracer* const block_cache_tracer_; std::shared_ptr io_tracer_; + std::string db_session_id_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access diff --git a/db/column_family_test.cc b/db/column_family_test.cc index fcb71926802..2db49813abe 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -35,10 +35,10 @@ namespace ROCKSDB_NAMESPACE { static const int kValueSize = 1000; // counts how many operations were performed -class EnvCounter : public EnvWrapper { +class EnvCounter : public SpecialEnv { public: explicit EnvCounter(Env* base) - : EnvWrapper(base), num_new_writable_file_(0) {} + : SpecialEnv(base), num_new_writable_file_(0) {} int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; } @@ -56,23 +56,16 @@ class ColumnFamilyTestBase : public testing::Test { public: explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { Env* base_env = Env::Default(); -#ifndef ROCKSDB_LITE - const char* test_env_uri = getenv("TEST_ENV_URI"); - if (test_env_uri) { - Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } -#endif // !ROCKSDB_LITE + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); env_ = new EnvCounter(base_env); + env_->skip_fsync_ = true; dbname_ = test::PerThreadDBPath("column_family_test"); db_options_.create_if_missing = true; db_options_.fail_if_options_file_error = true; db_options_.env = env_; - DestroyDB(dbname_, 
Options(db_options_, column_family_options_)); + EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); } ~ColumnFamilyTestBase() override { @@ -87,7 +80,6 @@ class ColumnFamilyTestBase : public testing::Test { #endif // ROCKSDB_LITE column_families.push_back(cfdescriptor); } - Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Destroy(column_families); delete env_; @@ -187,8 +179,8 @@ class ColumnFamilyTestBase : public testing::Test { std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor( - cf[i], options.size() == 0 ? column_family_options_ : options[i])); + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); names_.push_back(cf[i]); } return DB::Open(db_options_, dbname_, column_families, &handles_, &db_); @@ -199,8 +191,8 @@ class ColumnFamilyTestBase : public testing::Test { std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor( - cf[i], options.size() == 0 ? column_family_options_ : options[i])); + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); names_.push_back(cf[i]); } return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_, @@ -653,8 +645,8 @@ TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) { // after flushing file B is deleted. At the same time, the min log number of // default CF is not written to manifest. Log file A still remains. // Flushed to SST file Y. - Flush(1); - Flush(0); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); ASSERT_OK(Put(1, "bar", "v3")); // seqID 4 ASSERT_OK(Put(1, "foo", "v4")); // seqID 5 ASSERT_OK(db_->FlushWAL(/*sync=*/false)); @@ -708,15 +700,15 @@ TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) { // and is set to current. Both CFs' min log number is set to file C so after // flushing file B is deleted. Log file A still remains. // Flushed to SST file Y. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(0, "bar", "v2")); // seqID 4 ASSERT_OK(Put(2, "bar", "v2")); // seqID 5 ASSERT_OK(Put(1, "bar", "v3")); // seqID 6 // Flushing all column families. This forces all CFs' min log to current. This // is written to the manifest file. Log file C is cleared. - Flush(0); - Flush(1); - Flush(2); + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(2)); // Write to log file D ASSERT_OK(Put(1, "bar", "v4")); // seqID 7 ASSERT_OK(Put(1, "bar", "v5")); // seqID 8 @@ -898,9 +890,7 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) { std::vector old_files; ASSERT_OK(env_->GetChildren(backup_logs, &old_files)); for (auto& file : old_files) { - if (file != "." && file != "..") { - ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); - } + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); } column_family_options_.merge_operator = @@ -929,9 +919,7 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) { std::vector logs; ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); - } + CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); } // recover the DB @@ -956,9 +944,7 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) { if (iter == 0) { // copy the logs from backup back to wal dir for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); } } } @@ -985,7 +971,7 @@ TEST_P(ColumnFamilyTest, FlushTest) { for (int i = 0; i < 3; ++i) { uint64_t max_total_in_memory_state = MaxTotalInMemoryState(); - Flush(i); + ASSERT_OK(Flush(i)); AssertMaxTotalInMemoryState(max_total_in_memory_state); } ASSERT_OK(Put(1, "foofoo", "bar")); @@ -1093,7 +1079,7 @@ TEST_P(ColumnFamilyTest, CrashAfterFlush) { ASSERT_OK(batch.Put(handles_[0], Slice("foo"), Slice("bar"))); ASSERT_OK(batch.Put(handles_[1], Slice("foo"), Slice("bar"))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); - Flush(0); + ASSERT_OK(Flush(0)); fault_env->SetFilesystemActive(false); std::vector names; @@ -1103,7 +1089,7 @@ TEST_P(ColumnFamilyTest, CrashAfterFlush) { } } Close(); - fault_env->DropUnsyncedFileData(); + ASSERT_OK(fault_env->DropUnsyncedFileData()); fault_env->ResetState(); Open(names, {}); @@ -2236,7 +2222,7 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) { // files for column family [one], because it's empty AssertCountLiveFiles(4); - Flush(0); + ASSERT_OK(Flush(0)); ASSERT_EQ(0, dbfull()->TEST_total_log_size()); Close(); } @@ -2292,6 +2278,8 @@ TEST_P(ColumnFamilyTest, SanitizeOptions) { // not a multiple of 4k, round up 4k expected_arena_block_size += 4 * 1024; } + expected_arena_block_size = + std::min(size_t{1024 * 1024}, expected_arena_block_size); ASSERT_EQ(expected_arena_block_size, result.arena_block_size); } } @@ -3040,7 +3028,7 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) { Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]); ASSERT_OK(it->status()); // A flush will make `it` hold the last reference of its super version. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(0, "fodor", "mirko")); @@ -3093,7 +3081,7 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) { Iterator* it = db_->NewIterator(ro, handles_[1]); ASSERT_OK(it->status()); // A flush will make `it` hold the last reference of its super version. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(0, "fodor", "mirko")); @@ -3147,7 +3135,7 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(1, "fodar2", "mirko")); - Flush(1); + ASSERT_OK(Flush(1)); // Create an iterator holding the current super version, as well as // the SST file just flushed. 
@@ -3159,7 +3147,7 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(1, "fodar2", "mirko")); - Flush(1); + ASSERT_OK(Flush(1)); WaitForCompaction(); @@ -3232,9 +3220,9 @@ TEST_P(ColumnFamilyTest, LogSyncConflictFlush) { ROCKSDB_NAMESPACE::port::Thread thread([&] { ASSERT_OK(db_->SyncWAL()); }); TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1"); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2"); @@ -3256,7 +3244,7 @@ TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) { Build(0, 100); // Flush the 0th column family to force a roll of the wal log - Flush(0); + ASSERT_OK(Flush(0)); // Add some more entries Build(100, 100); @@ -3271,7 +3259,7 @@ TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) { FileType type; if (!(ParseFileName(filenames[i], &number, &type))) continue; - if (type != kLogFile) continue; + if (type != kWalFile) continue; logfs.push_back(filenames[i]); } @@ -3332,14 +3320,14 @@ TEST_P(ColumnFamilyTest, DefaultCfPathsTest) { // Fill Column family 1. PutRandomData(1, 100, 100); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); // Fill column family 2 PutRandomData(2, 100, 100); - Flush(2); + ASSERT_OK(Flush(2)); // SST from Column family 2 should be generated in // db_paths which is dbname_ in this case. @@ -3358,14 +3346,14 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); PutRandomData(1, 100, 100, true /* save */); - Flush(1); + ASSERT_OK(Flush(1)); // Check that files are generated in appropriate paths. ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); PutRandomData(2, 100, 100, true /* save */); - Flush(2); + ASSERT_OK(Flush(2)); ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); @@ -3391,6 +3379,30 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { } } +TEST(ColumnFamilyTest, ValidateBlobGCCutoff) { + DBOptions db_options; + + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_age_cutoff = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_age_cutoff = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); +} + } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 048ed6e26f1..4793adddf7c 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -91,9 +91,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { // create couple files // Background compaction starts and waits in BackgroundCallCompaction:0 for (int i = 0; i < kLevel0Trigger * 4; ++i) { - db->Put(WriteOptions(), ToString(i), ""); - db->Put(WriteOptions(), ToString(100 - i), ""); - 
db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), "")); + ASSERT_OK(db->Put(WriteOptions(), ToString(100 - i), "")); + ASSERT_OK(db->Flush(FlushOptions())); } ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta; @@ -118,6 +118,78 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { delete db; } +TEST_F(CompactFilesTest, MultipleLevel) { + Options options; + options.create_if_missing = true; + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = 6; + // Add listener + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + DestroyDB(db_name_, options); + Status s = DB::Open(options, db_name_, &db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); + + // create couple files in L0, L3, L4 and L5 + for (int i = 5; i > 2; --i) { + collector->ClearFlushedFiles(); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), "")); + ASSERT_OK(db->Flush(FlushOptions())); + auto l0_files = collector->GetFlushedFiles(); + ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i)); + + std::string prop; + ASSERT_TRUE( + db->GetProperty("rocksdb.num-files-at-level" + ToString(i), &prop)); + ASSERT_EQ("1", prop); + } + ASSERT_OK(db->Put(WriteOptions(), ToString(0), "")); + ASSERT_OK(db->Flush(FlushOptions())); + + ColumnFamilyMetaData meta; + db->GetColumnFamilyMetaData(&meta); + // Compact files except the file in L3 + std::vector files; + for (int i = 0; i < 6; ++i) { + if (i == 3) continue; + for (auto& file : meta.levels[i].files) { + files.push_back(file.db_path + "/" + file.name); + } + } + + SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"}, + {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread thread([&] { + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0"); + ASSERT_OK(db->Put(WriteOptions(), "bar", "v2")); + ASSERT_OK(db->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db->Flush(FlushOptions())); + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1"); + }); + + // Compaction cannot move up the data to higher level + // here we have input file from level 5, so the output level has to be >= 5 + for (int invalid_output_level = 0; invalid_output_level < 5; + invalid_output_level++) { + s = db->CompactFiles(CompactionOptions(), files, invalid_output_level); + std::cout << s.ToString() << std::endl; + ASSERT_TRUE(s.IsInvalidArgument()); + } + + ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5)); + SyncPoint::GetInstance()->DisableProcessing(); + thread.join(); + + delete db; +} + TEST_F(CompactFilesTest, ObsoleteFiles) { Options options; // to trigger compaction more easily @@ -138,18 +210,18 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); - assert(db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); // create couple files for (int i = 1000; i < 2000; ++i) { - db->Put(WriteOptions(), ToString(i), - std::string(kWriteBufferSize / 10, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(kWriteBufferSize / 10, 'a' + (i % 26)))); } auto l0_files = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); - static_cast_with_check(db)->TEST_WaitForCompact(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForCompact()); // verify all compaction input files are deleted for (auto fname : l0_files) { @@ 
-182,15 +254,17 @@ TEST_F(CompactFilesTest, NotCutOutputOnLevel0) { // create couple files for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - static_cast_with_check(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); collector->ClearFlushedFiles(); for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - static_cast_with_check(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_2 = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0)); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0)); @@ -213,13 +287,13 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(s); assert(db); // Create 5 files. for (int i = 0; i < 5; ++i) { - db->Put(WriteOptions(), "key" + ToString(i), "value"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key" + ToString(i), "value")); + ASSERT_OK(db->Flush(FlushOptions())); } auto l0_files = collector->GetFlushedFiles(); @@ -237,8 +311,8 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { // In the meantime flush another file. TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0"); - db->Put(WriteOptions(), "key5", "value"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key5", "value")); + ASSERT_OK(db->Flush(FlushOptions())); TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1"); compaction_thread.join(); @@ -249,7 +323,7 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { // Make sure we can reopen the DB. 
s = DB::Open(options, db_name_, &db); - ASSERT_TRUE(s.ok()); + ASSERT_OK(s); assert(db); delete db; } @@ -293,8 +367,8 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { cf->SetDB(db); // Write one L0 file - db->Put(WriteOptions(), "K1", "V1"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "K1", "V1")); + ASSERT_OK(db->Flush(FlushOptions())); // Compact all L0 files using CompactFiles ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta; @@ -337,8 +411,8 @@ TEST_F(CompactFilesTest, SentinelCompressionType) { DB* db = nullptr; ASSERT_OK(DB::Open(options, db_name_, &db)); - db->Put(WriteOptions(), "key", "val"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db->Flush(FlushOptions())); auto l0_files = collector->GetFlushedFiles(); ASSERT_EQ(1, l0_files.size()); @@ -377,14 +451,15 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) { DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(s); assert(db); // create couple files for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - static_cast_with_check(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); CompactionOptions co; co.compression = CompressionType::kLZ4Compression; diff --git a/db/compaction/clipping_iterator.h b/db/compaction/clipping_iterator.h new file mode 100644 index 00000000000..b287b653e10 --- /dev/null +++ b/db/compaction/clipping_iterator.h @@ -0,0 +1,275 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that wraps another one and ensures that any keys +// returned are strictly within a range [start, end). If the underlying +// iterator has already performed the bounds checking, it relies on that result; +// otherwise, it performs the necessary key comparisons itself. Both bounds +// are optional. 
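+// A minimal usage sketch (assuming the test::VectorIterator helper and
+// BytewiseComparator(), as used by the unit test below): clip an iterator
+// over "key0".."key3" to the range ["key1", "key3").
+//
+//   std::vector<std::string> keys{"key0", "key1", "key2", "key3"};
+//   std::vector<std::string> values{"v0", "v1", "v2", "v3"};
+//   test::VectorIterator input(keys, values);
+//   Slice start("key1");
+//   Slice end("key3");
+//   ClippingIterator clip(&input, &start, &end, BytewiseComparator());
+//   for (clip.SeekToFirst(); clip.Valid(); clip.Next()) {
+//     // visits only ("key1", "v1") and ("key2", "v2"); "key0" falls below
+//     // the lower bound, and "key3" is excluded since the upper bound is
+//     // exclusive.
+//   }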
+class ClippingIterator : public InternalIterator { + public: + ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end, + const Comparator* cmp) + : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) { + assert(iter_); + assert(cmp_); + assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0); + + UpdateAndEnforceBounds(); + } + + bool Valid() const override { return valid_; } + + void SeekToFirst() override { + if (start_) { + iter_->Seek(*start_); + } else { + iter_->SeekToFirst(); + } + + UpdateAndEnforceUpperBound(); + } + + void SeekToLast() override { + if (end_) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + } else { + iter_->SeekToLast(); + } + + UpdateAndEnforceLowerBound(); + } + + void Seek(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + iter_->Seek(*start_); + UpdateAndEnforceUpperBound(); + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + valid_ = false; + return; + } + + iter_->Seek(target); + UpdateAndEnforceUpperBound(); + } + + void SeekForPrev(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + valid_ = false; + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + + UpdateAndEnforceLowerBound(); + return; + } + + iter_->SeekForPrev(target); + UpdateAndEnforceLowerBound(); + } + + void Next() override { + assert(valid_); + iter_->Next(); + UpdateAndEnforceUpperBound(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(valid_); + assert(result); + + IterateResult res; + valid_ = iter_->NextAndGetResult(&res); + + if (!valid_) { + return false; + } + + if (end_) { + EnforceUpperBoundImpl(res.bound_check_result); + + if (!valid_) { + return false; + } + } + + res.bound_check_result = IterBoundCheck::kInbound; + *result = res; + + return true; + } + + void Prev() override { + assert(valid_); + iter_->Prev(); + UpdateAndEnforceLowerBound(); + } + + Slice key() const override { + assert(valid_); + return iter_->key(); + } + + Slice user_key() const override { + assert(valid_); + return iter_->user_key(); + } + + Slice value() const override { + assert(valid_); + return iter_->value(); + } + + Status status() const override { return iter_->status(); } + + bool PrepareValue() override { + assert(valid_); + + if (iter_->PrepareValue()) { + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } + + bool MayBeOutOfLowerBound() override { + assert(valid_); + return false; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(valid_); + return IterBoundCheck::kInbound; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(valid_); + return iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(valid_); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateValid() { + assert(!iter_->Valid() || iter_->status().ok()); + + valid_ = iter_->Valid(); + } + + void 
EnforceUpperBoundImpl(IterBoundCheck bound_check_result) { + if (bound_check_result == IterBoundCheck::kInbound) { + return; + } + + if (bound_check_result == IterBoundCheck::kOutOfBound) { + valid_ = false; + return; + } + + assert(bound_check_result == IterBoundCheck::kUnknown); + + if (cmp_->Compare(key(), *end_) >= 0) { + valid_ = false; + } + } + + void EnforceUpperBound() { + if (!valid_) { + return; + } + + if (!end_) { + return; + } + + EnforceUpperBoundImpl(iter_->UpperBoundCheckResult()); + } + + void EnforceLowerBound() { + if (!valid_) { + return; + } + + if (!start_) { + return; + } + + if (!iter_->MayBeOutOfLowerBound()) { + return; + } + + if (cmp_->Compare(key(), *start_) < 0) { + valid_ = false; + } + } + + void AssertBounds() { + assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0); + assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0); + } + + void UpdateAndEnforceBounds() { + UpdateValid(); + EnforceUpperBound(); + EnforceLowerBound(); + AssertBounds(); + } + + void UpdateAndEnforceUpperBound() { + UpdateValid(); + EnforceUpperBound(); + AssertBounds(); + } + + void UpdateAndEnforceLowerBound() { + UpdateValid(); + EnforceLowerBound(); + AssertBounds(); + } + + InternalIterator* iter_; + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; + bool valid_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/clipping_iterator_test.cc b/db/compaction/clipping_iterator_test.cc new file mode 100644 index 00000000000..3a31b61eb49 --- /dev/null +++ b/db/compaction/clipping_iterator_test.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/compaction/clipping_iterator.h" + +#include +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +// A vector iterator which does its own bounds checking. This is for testing the +// optimizations in the clipping iterator where we bypass the bounds checking if +// the input iterator has already performed it. +class BoundsCheckingVectorIterator : public test::VectorIterator { + public: + BoundsCheckingVectorIterator(const std::vector& keys, + const std::vector& values, + const Slice* start, const Slice* end, + const Comparator* cmp) + : VectorIterator(keys, values), start_(start), end_(end), cmp_(cmp) { + assert(cmp_); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + assert(result); + + Next(); + + if (!Valid()) { + return false; + } + + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = true; + + return true; + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + + if (!start_) { + return false; + } + + return cmp_->Compare(key(), *start_) < 0; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + + if (!end_) { + return IterBoundCheck::kInbound; + } + + return cmp_->Compare(key(), *end_) >= 0 ? 
IterBoundCheck::kOutOfBound + : IterBoundCheck::kInbound; + } + + private: + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; +}; + +class ClippingIteratorTest + : public ::testing::Test, + public ::testing::WithParamInterface> {}; + +TEST_P(ClippingIteratorTest, Clip) { + const std::vector keys{"key0", "key1", "key2", "key3", "key4", + "key5", "key6", "key7", "key8", "key9"}; + const std::vector values{ + "unused0", "value1", "value2", "value3", "unused4", + "unused5", "unused6", "unused7", "unused8", "unused9"}; + + assert(keys.size() == values.size()); + + // Note: the input always contains key1, key2, and key3; however, the clipping + // window is based on the test parameters: its left edge is a value in the + // range [0, 4], and its size is a value in the range [0, 5] + const std::vector input_keys{keys[1], keys[2], keys[3]}; + const std::vector input_values{values[1], values[2], values[3]}; + + const bool use_bounds_checking_vec_it = std::get<0>(GetParam()); + + const size_t clip_start_idx = std::get<1>(GetParam()); + const size_t clip_window_size = std::get<2>(GetParam()); + const size_t clip_end_idx = clip_start_idx + clip_window_size; + + const Slice start(keys[clip_start_idx]); + const Slice end(keys[clip_end_idx]); + + std::unique_ptr input( + use_bounds_checking_vec_it + ? new BoundsCheckingVectorIterator(input_keys, input_values, &start, + &end, BytewiseComparator()) + : new test::VectorIterator(input_keys, input_values)); + + ClippingIterator clip(input.get(), &start, &end, BytewiseComparator()); + + // The range the clipping iterator should return values from. This is + // essentially the intersection of the input range [1, 4) and the clipping + // window [clip_start_idx, clip_end_idx) + const size_t data_start_idx = + std::max(clip_start_idx, static_cast(1)); + const size_t data_end_idx = std::min(clip_end_idx, static_cast(4)); + + // Range is empty; all Seeks should fail + if (data_start_idx >= data_end_idx) { + clip.SeekToFirst(); + ASSERT_FALSE(clip.Valid()); + + clip.SeekToLast(); + ASSERT_FALSE(clip.Valid()); + + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + ASSERT_FALSE(clip.Valid()); + + clip.SeekForPrev(keys[i]); + ASSERT_FALSE(clip.Valid()); + } + + return; + } + + // Range is non-empty; call SeekToFirst and iterate forward + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + clip.Next(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Next(); + ASSERT_FALSE(clip.Valid()); + + // Do it again using NextAndGetResult + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + IterateResult result; + ASSERT_TRUE(clip.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[i]); + ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + 
ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + IterateResult result; + ASSERT_FALSE(clip.NextAndGetResult(&result)); + ASSERT_FALSE(clip.Valid()); + + // Call SeekToLast and iterate backward + clip.SeekToLast(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) { + clip.Prev(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Prev(); + ASSERT_FALSE(clip.Valid()); + + // Call Seek/SeekForPrev for all keys; Seek should return the smallest key + // which is >= the target; SeekForPrev should return the largest key which is + // <= the target + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + + if (i < data_start_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else { + ASSERT_FALSE(clip.Valid()); + } + + clip.SeekForPrev(keys[i]); + + if (i < data_start_idx) { + ASSERT_FALSE(clip.Valid()); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + } +} + +INSTANTIATE_TEST_CASE_P( + ClippingIteratorTest, ClippingIteratorTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Range(static_cast(0), static_cast(5)), + ::testing::Range(static_cast(0), static_cast(6)))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 60e2681faa1..57f814fbc75 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -204,27 +204,24 @@ bool Compaction::IsFullCompaction( return num_files_in_compaction == total_num_files; } -Compaction::Compaction(VersionStorageInfo* vstorage, - const ImmutableCFOptions& _immutable_cf_options, - const MutableCFOptions& _mutable_cf_options, - const MutableDBOptions& _mutable_db_options, - std::vector _inputs, - int _output_level, uint64_t _target_file_size, - uint64_t _max_compaction_bytes, uint32_t _output_path_id, - CompressionType _compression, - CompressionOptions _compression_opts, - uint32_t _max_subcompactions, - std::vector _grandparents, - bool _manual_compaction, double _score, - bool _deletion_compaction, - CompactionReason 
_compaction_reason) +Compaction::Compaction( + VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options, + const MutableCFOptions& _mutable_cf_options, + const MutableDBOptions& _mutable_db_options, + std::vector _inputs, int _output_level, + uint64_t _target_file_size, uint64_t _max_compaction_bytes, + uint32_t _output_path_id, CompressionType _compression, + CompressionOptions _compression_opts, uint32_t _max_subcompactions, + std::vector _grandparents, bool _manual_compaction, + double _score, bool _deletion_compaction, + CompactionReason _compaction_reason) : input_vstorage_(vstorage), start_level_(_inputs[0].level), output_level_(_output_level), max_output_file_size_(_target_file_size), max_compaction_bytes_(_max_compaction_bytes), max_subcompactions_(_max_subcompactions), - immutable_cf_options_(_immutable_cf_options), + immutable_options_(_immutable_options), mutable_cf_options_(_mutable_cf_options), input_version_(nullptr), number_levels_(vstorage->num_levels()), @@ -248,12 +245,6 @@ Compaction::Compaction(VersionStorageInfo* vstorage, if (max_subcompactions_ == 0) { max_subcompactions_ = _mutable_db_options.max_subcompactions; } - if (!bottommost_level_) { - // Currently we only enable dictionary compression during compaction to the - // bottommost level. - output_compression_opts_.max_dict_bytes = 0; - output_compression_opts_.zstd_max_train_bytes = 0; - } #ifndef NDEBUG for (size_t i = 1; i < inputs_.size(); ++i) { @@ -284,7 +275,7 @@ Compaction::~Compaction() { bool Compaction::InputCompressionMatchesOutput() const { int base_level = input_vstorage_->base_level(); - bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_, + bool matches = (GetCompressionType(immutable_options_, input_vstorage_, mutable_cf_options_, start_level_, base_level) == output_compression_); if (matches) { @@ -309,8 +300,8 @@ bool Compaction::IsTrivialMove() const { } if (is_manual_compaction_ && - (immutable_cf_options_.compaction_filter != nullptr || - immutable_cf_options_.compaction_filter_factory != nullptr)) { + (immutable_options_.compaction_filter != nullptr || + immutable_options_.compaction_filter_factory != nullptr)) { // This is a manual compaction and we have a compaction filter that should // be executed, we cannot do a trivial move return false; @@ -383,7 +374,13 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( auto* f = files[level_ptrs->at(lvl)]; if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // In the presence of user-defined timestamp, we may need to handle + // the case in which f->smallest.user_key() (including ts) has the + // same user key, but the ts part is smaller. If so, + // Compare(user_key, f->smallest.user_key()) returns -1. + // That's why we need CompareWithoutTimestamp(). + if (user_cmp->CompareWithoutTimestamp(user_key, + f->smallest.user_key()) >= 0) { // Key falls in this file's range, so it may // exist beyond output level return false; @@ -512,14 +509,14 @@ uint64_t Compaction::OutputFilePreallocationSize() const { } if (max_output_file_size_ != port::kMaxUint64 && - (immutable_cf_options_.compaction_style == kCompactionStyleLevel || + (immutable_options_.compaction_style == kCompactionStyleLevel || output_level() > 0)) { preallocation_size = std::min(max_output_file_size_, preallocation_size); } // Over-estimate slightly so we don't end up just barely crossing // the threshold - // No point to prellocate more than 1GB. 
+ // No point to preallocate more than 1GB. return std::min(uint64_t{1073741824}, preallocation_size + (preallocation_size / 10)); } @@ -529,16 +526,23 @@ std::unique_ptr Compaction::CreateCompactionFilter() const { return nullptr; } + if (!cfd_->ioptions() + ->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kCompaction)) { + return nullptr; + } + CompactionFilter::Context context; context.is_full_compaction = is_full_compaction_; context.is_manual_compaction = is_manual_compaction_; context.column_family_id = cfd_->GetID(); + context.reason = TableFileCreationReason::kCompaction; return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); } std::unique_ptr Compaction::CreateSstPartitioner() const { - if (!immutable_cf_options_.sst_partitioner_factory) { + if (!immutable_options_.sst_partitioner_factory) { return nullptr; } @@ -548,8 +552,7 @@ std::unique_ptr Compaction::CreateSstPartitioner() const { context.output_level = output_level_; context.smallest_user_key = smallest_user_key_; context.largest_user_key = largest_user_key_; - return immutable_cf_options_.sst_partitioner_factory->CreatePartitioner( - context); + return immutable_options_.sst_partitioner_factory->CreatePartitioner(context); } bool Compaction::IsOutputLevelEmpty() const { @@ -560,6 +563,14 @@ bool Compaction::ShouldFormSubcompactions() const { if (max_subcompactions_ <= 1 || cfd_ == nullptr) { return false; } + + // Note: the subcompaction boundary picking logic does not currently guarantee + // that all user keys that differ only by timestamp get processed by the same + // subcompaction. + if (cfd_->user_comparator()->timestamp_size() > 0) { + return false; + } + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 && !IsOutputLevelEmpty(); diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index d25ffd603a5..eda9bf002f4 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -70,7 +70,7 @@ class CompactionFilter; class Compaction { public: Compaction(VersionStorageInfo* input_version, - const ImmutableCFOptions& immutable_cf_options, + const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, std::vector inputs, int output_level, @@ -162,7 +162,7 @@ class Compaction { CompressionType output_compression() const { return output_compression_; } // What compression options for output - CompressionOptions output_compression_opts() const { + const CompressionOptions& output_compression_opts() const { return output_compression_opts_; } @@ -223,10 +223,10 @@ class Compaction { // How many total levels are there? 
int number_levels() const { return number_levels_; } - // Return the ImmutableCFOptions that should be used throughout the compaction + // Return the ImmutableOptions that should be used throughout the compaction // procedure - const ImmutableCFOptions* immutable_cf_options() const { - return &immutable_cf_options_; + const ImmutableOptions* immutable_options() const { + return &immutable_options_; } // Return the MutableCFOptions that should be used throughout the compaction @@ -330,7 +330,7 @@ class Compaction { uint64_t max_output_file_size_; uint64_t max_compaction_bytes_; uint32_t max_subcompactions_; - const ImmutableCFOptions immutable_cf_options_; + const ImmutableOptions immutable_options_; const MutableCFOptions mutable_cf_options_; Version* input_version_; VersionEdit edit_; @@ -341,7 +341,7 @@ class Compaction { const uint32_t output_path_id_; CompressionType output_compression_; CompressionOptions output_compression_opts_; - // If true, then the comaction can be done by simply deleting input files. + // If true, then the compaction can be done by simply deleting input files. const bool deletion_compaction_; // Compaction input files organized by level. Constant after construction diff --git a/db/compaction/compaction_iteration_stats.h b/db/compaction/compaction_iteration_stats.h index 963c1d8eb49..cb7b82c65ab 100644 --- a/db/compaction/compaction_iteration_stats.h +++ b/db/compaction/compaction_iteration_stats.h @@ -34,4 +34,8 @@ struct CompactionIterationStats { // Single-Delete diagnostics for exceptional situations uint64_t num_single_del_fallthru = 0; uint64_t num_single_del_mismatch = 0; + + // Blob related statistics + uint64_t num_blobs_read = 0; + uint64_t total_blob_bytes_read = 0; }; diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index 4555ec56832..e48818fd086 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -5,9 +5,11 @@ #include "db/compaction/compaction_iterator.h" -#include +#include +#include #include "db/blob/blob_file_builder.h" +#include "db/blob/blob_index.h" #include "db/snapshot_checker.h" #include "port/likely.h" #include "rocksdb/listener.h" @@ -31,7 +33,6 @@ (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq)))) namespace ROCKSDB_NAMESPACE { - CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, @@ -44,16 +45,19 @@ CompactionIterator::CompactionIterator( const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, const std::atomic* manual_compaction_paused, - const std::shared_ptr info_log) + const std::atomic* manual_compaction_canceled, + const std::shared_ptr info_log, + const std::string* full_history_ts_low) : CompactionIterator( input, cmp, merge_helper, last_sequence, snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, report_detailed_time, expect_valid_internal_key, range_del_agg, blob_file_builder, allow_data_in_errors, std::unique_ptr( - compaction ? new CompactionProxy(compaction) : nullptr), + compaction ? 
new RealCompaction(compaction) : nullptr), compaction_filter, shutting_down, preserve_deletes_seqnum, - manual_compaction_paused, info_log) {} + manual_compaction_paused, manual_compaction_canceled, info_log, + full_history_ts_low) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -68,14 +72,20 @@ CompactionIterator::CompactionIterator( const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, const std::atomic* manual_compaction_paused, - const std::shared_ptr info_log) - : input_(input), + const std::atomic* manual_compaction_canceled, + const std::shared_ptr info_log, + const std::string* full_history_ts_low) + : input_( + input, cmp, + compaction == + nullptr), // Now only need to count number of entries in flush. cmp_(cmp), merge_helper_(merge_helper), snapshots_(snapshots), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), env_(env), + clock_(env_->GetSystemClock().get()), report_detailed_time_(report_detailed_time), expect_valid_internal_key_(expect_valid_internal_key), range_del_agg_(range_del_agg), @@ -84,14 +94,20 @@ CompactionIterator::CompactionIterator( compaction_filter_(compaction_filter), shutting_down_(shutting_down), manual_compaction_paused_(manual_compaction_paused), + manual_compaction_canceled_(manual_compaction_canceled), preserve_deletes_seqnum_(preserve_deletes_seqnum), + info_log_(info_log), + allow_data_in_errors_(allow_data_in_errors), + timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0), + full_history_ts_low_(full_history_ts_low), current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), + blob_garbage_collection_cutoff_file_number_( + ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())), current_key_committed_(false), - info_log_(info_log), - allow_data_in_errors_(allow_data_in_errors) { - assert(compaction_filter_ == nullptr || compaction_ != nullptr); + cmp_with_history_ts_low_(0), + level_(compaction_ == nullptr ? 0 : compaction_->level()) { assert(snapshots_ != nullptr); bottommost_level_ = compaction_ == nullptr ? false @@ -117,14 +133,16 @@ CompactionIterator::CompactionIterator( for (size_t i = 1; i < snapshots_->size(); ++i) { assert(snapshots_->at(i - 1) < snapshots_->at(i)); } + assert(timestamp_size_ == 0 || !full_history_ts_low_ || + timestamp_size_ == full_history_ts_low_->size()); #endif - input_->SetPinnedItersMgr(&pinned_iters_mgr_); + input_.SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } CompactionIterator::~CompactionIterator() { - // input_ Iteartor lifetime is longer than pinned_iters_mgr_ lifetime - input_->SetPinnedItersMgr(nullptr); + // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime + input_.SetPinnedItersMgr(nullptr); } void CompactionIterator::ResetRecordCounts() { @@ -151,13 +169,13 @@ void CompactionIterator::Next() { if (merge_out_iter_.Valid()) { key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - Status s = ParseInternalKey(key_, &ikey_); + Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. assert(s.ok()); if (!s.ok()) { - ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", - key_.ToString(true).c_str()); + ROCKS_LOG_FATAL(info_log_, "Invalid key in compaction. 
%s", + s.getState()); } // Keep current_key_ in sync. @@ -177,7 +195,7 @@ void CompactionIterator::Next() { // Only advance the input iterator if there is no merge output and the // iterator is not already at the next record. if (!at_next_) { - input_->Next(); + AdvanceInputIter(); } NextFromInput(); } @@ -192,100 +210,172 @@ void CompactionIterator::Next() { bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until) { - if (compaction_filter_ != nullptr && - (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) { - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. If the return value of the compaction filter is true, - // replace the entry with a deletion marker. - CompactionFilter::Decision filter; - compaction_filter_value_.clear(); - compaction_filter_skip_until_.Clear(); - CompactionFilter::ValueType value_type = - ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue - : CompactionFilter::ValueType::kBlobIndex; - // Hack: pass internal key to BlobIndexCompactionFilter since it needs - // to get sequence number. - Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_; - { - StopWatchNano timer(env_, report_detailed_time_); + if (!compaction_filter_ || + (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) { + return true; + } + bool error = false; + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. If the return value of the compaction filter is true, + // replace the entry with a deletion marker. + CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined; + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. + assert(compaction_filter_); + Slice& filter_key = + (ikey_.type == kTypeValue || + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) + ? ikey_.user_key + : key_; + { + StopWatchNano timer(clock_, report_detailed_time_); + if (kTypeBlobIndex == ikey_.type) { + blob_value_.Reset(); + filter = compaction_filter_->FilterBlobByKey( + level_, filter_key, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); + if (CompactionFilter::Decision::kUndetermined == filter && + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + // For integrated BlobDB impl, CompactionIterator reads blob value. + // For Stacked BlobDB impl, the corresponding CompactionFilter's + // FilterV2 method should read the blob value. 
+ BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value_); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + if (blob_index.HasTTL() || blob_index.IsInlined()) { + status_ = Status::Corruption("Unexpected TTL/inlined blob index"); + valid_ = false; + return false; + } + if (compaction_ == nullptr) { + status_ = + Status::Corruption("Unexpected blob index outside of compaction"); + valid_ = false; + return false; + } + const Version* const version = compaction_->input_version(); + assert(version); + + uint64_t bytes_read = 0; + s = version->GetBlob(ReadOptions(), ikey_.user_key, blob_index, + &blob_value_, &bytes_read); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + value_type = CompactionFilter::ValueType::kValue; + } + } + if (CompactionFilter::Decision::kUndetermined == filter) { filter = compaction_filter_->FilterV2( - compaction_->level(), filter_key, value_type, value_, - &compaction_filter_value_, compaction_filter_skip_until_.rep()); - iter_stats_.total_filter_time += - env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + level_, filter_key, value_type, + blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); } + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } - if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && - cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= - 0) { - // Can't skip to a key smaller than the current one. - // Keep the key as per FilterV2 documentation. - filter = CompactionFilter::Decision::kKeep; - } + if (CompactionFilter::Decision::kUndetermined == filter) { + // Should not reach here, since FilterV2 should never return kUndetermined. 
+ status_ = + Status::NotSupported("FilterV2() should never return kUndetermined"); + valid_ = false; + return false; + } - if (filter == CompactionFilter::Decision::kRemove) { - // convert the current key to a delete; key_ is pointing into - // current_key_ at this point, so updating current_key_ updates key() - ikey_.type = kTypeDeletion; - current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); - // no value associated with delete - value_.clear(); - iter_stats_.num_record_drop_user++; - } else if (filter == CompactionFilter::Decision::kChangeValue) { - if (ikey_.type == kTypeBlobIndex) { - // value transfer from blob file to inlined data - ikey_.type = kTypeValue; - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); - } - value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { - *need_skip = true; - compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, - kValueTypeForSeek); - *skip_until = compaction_filter_skip_until_.Encode(); - } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) { - if (ikey_.type == kTypeValue) { - // value transfer from inlined data to blob file - ikey_.type = kTypeBlobIndex; - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); - } - value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kIOError) { - status_ = - Status::IOError("Failed to access blob during compaction filter"); + if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2 documentation. + filter = CompactionFilter::Decision::kKeep; + } + + if (filter == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kChangeValue) { + if (ikey_.type == kTypeBlobIndex) { + // value transfer from blob file to inlined data + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) { + // Only the StackableDB-based BlobDB impl's compaction filter should return + // kChangeBlobIndex. Decision about rewriting blob and changing blob index + // in the integrated BlobDB impl is made in subsequent call to + // PrepareOutput() and its callees. 
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "Only stacked BlobDB's internal compaction filter can return " + "kChangeBlobIndex."); + valid_ = false; + return false; + } + if (ikey_.type == kTypeValue) { + // value transfer from inlined data to blob file + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kIOError) { + if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "CompactionFilter for integrated BlobDB should not return kIOError"); + valid_ = false; return false; } + status_ = Status::IOError("Failed to access blob during compaction filter"); + error = true; } - return true; + return !error; } void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; - while (!valid_ && input_->Valid() && !IsPausingManualCompaction() && + while (!valid_ && input_.Valid() && !IsPausingManualCompaction() && !IsShuttingDown()) { - key_ = input_->key(); - value_ = input_->value(); + key_ = input_.key(); + value_ = input_.value(); iter_stats_.num_input_records++; - Status pikStatus = ParseInternalKey(key_, &ikey_); - if (!pikStatus.ok()) { + Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + if (!pik_status.ok()) { iter_stats_.num_input_corrupt_records++; // If `expect_valid_internal_key_` is false, return the corrupted key // and let the caller decide what to do with it. - // TODO(noetzli): We should have a more elegant solution for this. if (expect_valid_internal_key_) { - std::string msg("Corrupted internal key not expected."); - if (allow_data_in_errors_) { - msg.append(" Corrupt key: " + ikey_.user_key.ToString(/*hex=*/true) + - ". "); - msg.append("key type: " + std::to_string(ikey_.type) + "."); - msg.append("seq: " + std::to_string(ikey_.sequence) + "."); - } - status_ = Status::Corruption(msg.c_str()); + status_ = pik_status; return; } key_ = current_key_.SetInternalKey(key_); @@ -298,7 +388,8 @@ void CompactionIterator::NextFromInput() { TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); // Update input statistics - if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion || + ikey_.type == kTypeDeletionWithTimestamp) { iter_stats_.num_input_deletion_records++; } iter_stats_.total_input_raw_key_bytes += key_.size(); @@ -311,19 +402,54 @@ void CompactionIterator::NextFromInput() { // merge_helper_->compaction_filter_skip_until_. Slice skip_until; + bool user_key_equal_without_ts = false; + int cmp_ts = 0; + if (has_current_user_key_) { + user_key_equal_without_ts = + cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_); + // if timestamp_size_ > 0, then curr_ts_ has been initialized by a + // previous key. + cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp( + ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_), + curr_ts_) + : 0; + } + // Check whether the user key changed. After this if statement current_key_ // is a copy of the current input key (maybe converted to a delete by the // compaction filter). ikey_.user_key is pointing to the copy. 
- if (!has_current_user_key_ || - !cmp_->Equal(ikey_.user_key, current_user_key_)) { + if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) { // First occurrence of this user key // Copy key for output key_ = current_key_.SetInternalKey(key_, &ikey_); + + // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for the use + // in next iteration to compare with the timestamp of next key. + UpdateTimestampAndCompareWithFullHistoryLow(); + + // If + // (1) !has_current_user_key_, OR + // (2) timestamp is disabled, OR + // (3) all history will be preserved, OR + // (4) user key (excluding timestamp) is different from previous key, OR + // (5) timestamp is NO older than *full_history_ts_low_ + // then current_user_key_ must be treated as a different user key. + // This means, if a user key (excluding ts) is the same as the previous + // user key, and its ts is older than *full_history_ts_low_, then we + // consider this key for GC, e.g. it may be dropped if certain conditions + // match. + if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ || + !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0) { + // Initialize for future comparison for rule (A) and etc. + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + has_current_user_key_ = true; + } current_user_key_ = ikey_.user_key; - has_current_user_key_ = true; + has_outputted_key_ = false; - current_user_key_sequence_ = kMaxSequenceNumber; - current_user_key_snapshot_ = 0; + current_key_committed_ = KeyCommitted(ikey_.sequence); // Apply the compaction filter to the first committed version of the user @@ -381,8 +507,8 @@ void CompactionIterator::NextFromInput() { // In the previous iteration we encountered a single delete that we could // not compact out. We will keep this Put, but can drop it's data. // (See Optimization 3, below.) - assert(ikey_.type == kTypeValue); - if (ikey_.type != kTypeValue) { + assert(ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex); + if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex) { ROCKS_LOG_FATAL(info_log_, "Unexpected key type %d for compaction output", ikey_.type); @@ -395,6 +521,11 @@ void CompactionIterator::NextFromInput() { current_user_key_snapshot_, last_snapshot); } + if (ikey_.type == kTypeBlobIndex) { + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_.clear(); valid_ = true; clear_and_output_next_key_ = false; @@ -434,12 +565,13 @@ void CompactionIterator::NextFromInput() { // The easiest way to process a SingleDelete during iteration is to peek // ahead at the next key. ParsedInternalKey next_ikey; - input_->Next(); + AdvanceInputIter(); // Check whether the next key exists, is not corrupt, and is the same key // as the single delete. - if (input_->Valid() && - ParseInternalKey(input_->key(), &next_ikey) == Status::OK() && + if (input_.Valid() && + ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok() && cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { // Check whether the next key belongs to the same snapshot as the // SingleDelete. @@ -452,7 +584,7 @@ void CompactionIterator::NextFromInput() { // to handle the second SingleDelete // First SingleDelete has been skipped since we already called - // input_->Next(). + // input_.Next(). 
++iter_stats_.num_record_drop_obsolete; ++iter_stats_.num_single_del_mismatch; } else if (has_outputted_key_ || @@ -474,9 +606,9 @@ void CompactionIterator::NextFromInput() { ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_obsolete; - // Already called input_->Next() once. Call it a second time to + // Already called input_.Next() once. Call it a second time to // skip past the second key. - input_->Next(); + AdvanceInputIter(); } else { // Found a matching value, but we cannot drop both keys since // there is an earlier snapshot and we need to leave behind a record @@ -543,9 +675,12 @@ void CompactionIterator::NextFromInput() { last_sequence, current_user_key_sequence_); } - ++iter_stats_.num_record_drop_hidden; // (A) - input_->Next(); - } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion && + ++iter_stats_.num_record_drop_hidden; // rule (A) + AdvanceInputIter(); + } else if (compaction_ != nullptr && + (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikeyNotNeededForIncrementalSnapshot() && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, @@ -569,35 +704,47 @@ void CompactionIterator::NextFromInput() { // given that: // (1) The deletion is earlier than earliest_write_conflict_snapshot, and // (2) No value exist earlier than the deletion. + // + // Note also that a deletion marker of type kTypeDeletionWithTimestamp + // will be treated as a different user key unless the timestamp is older + // than *full_history_ts_low_. ++iter_stats_.num_record_drop_obsolete; if (!bottommost_level_) { ++iter_stats_.num_optimized_del_drop_obsolete; } - input_->Next(); - } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && - ikeyNotNeededForIncrementalSnapshot()) { + AdvanceInputIter(); + } else if ((ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + bottommost_level_ && ikeyNotNeededForIncrementalSnapshot()) { // Handle the case where we have a delete key at the bottom most level // We can skip outputting the key iff there are no subsequent puts for this // key assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel( ikey_.user_key, &level_ptrs_)); ParsedInternalKey next_ikey; - input_->Next(); - // Skip over all versions of this key that happen to occur in the same snapshot - // range as the delete + AdvanceInputIter(); + // Skip over all versions of this key that happen to occur in the same + // snapshot range as the delete. + // + // Note that a deletion marker of type kTypeDeletionWithTimestamp will be + // considered to have a different user key unless the timestamp is older + // than *full_history_ts_low_. 
while (!IsPausingManualCompaction() && !IsShuttingDown() && - input_->Valid() && - (ParseInternalKey(input_->key(), &next_ikey) == Status::OK()) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) && (prev_snapshot == 0 || DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) { - input_->Next(); + AdvanceInputIter(); } // If you find you still need to output a row with this key, we need to output the // delete too - if (input_->Valid() && - (ParseInternalKey(input_->key(), &next_ikey) == Status::OK()) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + if (input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { valid_ = true; at_next_ = true; } @@ -613,8 +760,9 @@ void CompactionIterator::NextFromInput() { // have hit (A) // We encapsulate the merge related state machine in a different // object to minimize change to the existing flow. - Status s = merge_helper_->MergeUntil(input_, range_del_agg_, - prev_snapshot, bottommost_level_); + Status s = + merge_helper_->MergeUntil(&input_, range_del_agg_, prev_snapshot, + bottommost_level_, allow_data_in_errors_); merge_out_iter_.SeekToFirst(); if (!s.ok() && !s.IsMergeInProgress()) { @@ -625,13 +773,13 @@ void CompactionIterator::NextFromInput() { // These will be correctly set below. key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - pikStatus = ParseInternalKey(key_, &ikey_); + pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to valid. - assert(pikStatus.ok()); - if (!pikStatus.ok()) { - ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", - key_.ToString(true).c_str()); + assert(pik_status.ok()); + if (!pik_status.ok()) { + ROCKS_LOG_FATAL(info_log_, "Invalid key in compaction. %s", + pik_status.getState()); } // Keep current_key_ in sync. 
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); @@ -657,14 +805,14 @@ void CompactionIterator::NextFromInput() { if (should_delete) { ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_range_del; - input_->Next(); + AdvanceInputIter(); } else { valid_ = true; } } if (need_skip) { - input_->Seek(skip_until); + SkipUntil(skip_until); } } @@ -677,42 +825,144 @@ void CompactionIterator::NextFromInput() { } } -void CompactionIterator::PrepareOutput() { - if (valid_) { - if (ikey_.type == kTypeValue) { - if (blob_file_builder_) { - blob_index_.clear(); - const Status s = - blob_file_builder_->Add(user_key(), value_, &blob_index_); +bool CompactionIterator::ExtractLargeValueIfNeededImpl() { + if (!blob_file_builder_) { + return false; + } - if (!s.ok()) { - status_ = s; - valid_ = false; - } else if (!blob_index_.empty()) { - value_ = blob_index_; - ikey_.type = kTypeBlobIndex; - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); - } + blob_index_.clear(); + const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_); + + if (!s.ok()) { + status_ = s; + valid_ = false; + + return false; + } + + if (blob_index_.empty()) { + return false; + } + + value_ = blob_index_; + + return true; +} + +void CompactionIterator::ExtractLargeValueIfNeeded() { + assert(ikey_.type == kTypeValue); + + if (!ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); +} + +void CompactionIterator::GarbageCollectBlobIfNeeded() { + assert(ikey_.type == kTypeBlobIndex); + + if (!compaction_) { + return; + } + + // GC for integrated BlobDB + if (compaction_->enable_blob_garbage_collection()) { + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value_); + + if (!s.ok()) { + status_ = s; + valid_ = false; + + return; } - } else if (ikey_.type == kTypeBlobIndex) { - if (compaction_filter_) { - const auto blob_decision = compaction_filter_->PrepareBlobOutput( - user_key(), value_, &compaction_filter_value_); + } - if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { - status_ = Status::Corruption( - "Corrupted blob reference encountered during GC"); - valid_ = false; - } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) { - status_ = Status::IOError("Could not relocate blob during GC"); - valid_ = false; - } else if (blob_decision == - CompactionFilter::BlobDecision::kChangeValue) { - value_ = compaction_filter_value_; - } + if (blob_index.IsInlined() || blob_index.HasTTL()) { + status_ = Status::Corruption("Unexpected TTL/inlined blob index"); + valid_ = false; + + return; + } + + if (blob_index.file_number() >= + blob_garbage_collection_cutoff_file_number_) { + return; + } + + const Version* const version = compaction_->input_version(); + assert(version); + + uint64_t bytes_read = 0; + + { + const Status s = version->GetBlob(ReadOptions(), user_key(), blob_index, + &blob_value_, &bytes_read); + + if (!s.ok()) { + status_ = s; + valid_ = false; + + return; } } + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + value_ = blob_value_; + + if (ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + + return; + } + + // GC for stacked BlobDB + if (compaction_filter_ && + compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + const auto blob_decision = compaction_filter_->PrepareBlobOutput( + user_key(), value_, 
&compaction_filter_value_); + + if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { + status_ = + Status::Corruption("Corrupted blob reference encountered during GC"); + valid_ = false; + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kIOError) { + status_ = Status::IOError("Could not relocate blob during GC"); + valid_ = false; + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) { + value_ = compaction_filter_value_; + + return; + } + } +} + +void CompactionIterator::PrepareOutput() { + if (valid_) { + if (ikey_.type == kTypeValue) { + ExtractLargeValueIfNeeded(); + } else if (ikey_.type == kTypeBlobIndex) { + GarbageCollectBlobIfNeeded(); + } + // Zeroing out the sequence number leads to better compression. // If this is the bottommost level (no files in lower levels) // and the earliest snapshot is larger than this seqno @@ -735,7 +985,18 @@ void CompactionIterator::PrepareOutput() { ikey_.type); } ikey_.sequence = 0; - current_key_.UpdateInternalKey(0, ikey_.type); + if (!timestamp_size_) { + current_key_.UpdateInternalKey(0, ikey_.type); + } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) { + // We can also zero out timestamp for better compression. + // For the same user key (excluding timestamp), the timestamp-based + // history can be collapsed to save some space if the timestamp is + // older than *full_history_ts_low_. + const std::string kTsMin(timestamp_size_, static_cast(0)); + const Slice ts_slice = kTsMin; + ikey_.SetTimestamp(ts_slice); + current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice); + } } } } @@ -827,4 +1088,30 @@ bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { return in_snapshot == SnapshotCheckerResult::kInSnapshot; } +uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction) { + if (!compaction) { + return 0; + } + + if (!compaction->enable_blob_garbage_collection()) { + return 0; + } + + Version* const version = compaction->input_version(); + assert(version); + + const VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + auto it = blob_files.begin(); + std::advance( + it, compaction->blob_garbage_collection_age_cutoff() * blob_files.size()); + + return it != blob_files.end() ? it->first + : std::numeric_limits::max(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 29dedd3c719..65df5c44498 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include @@ -23,76 +24,166 @@ namespace ROCKSDB_NAMESPACE { class BlobFileBuilder; +// A wrapper of internal iterator whose purpose is to count how +// many entries there are in the iterator. 
+class SequenceIterWrapper : public InternalIterator { + public: + SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp, + bool need_count_entries) + : icmp_(cmp, /*named=*/false), + inner_iter_(iter), + need_count_entries_(need_count_entries) {} + bool Valid() const override { return inner_iter_->Valid(); } + Status status() const override { return inner_iter_->status(); } + void Next() override { + num_itered_++; + inner_iter_->Next(); + } + void Seek(const Slice& target) override { + if (!need_count_entries_) { + inner_iter_->Seek(target); + } else { + // For flush cases, we need to count total number of entries, so we + // do Next() rather than Seek(). + while (inner_iter_->Valid() && + icmp_.Compare(inner_iter_->key(), target) < 0) { + Next(); + } + } + } + Slice key() const override { return inner_iter_->key(); } + Slice value() const override { return inner_iter_->value(); } + + // Unused InternalIterator methods + void SeekToFirst() override { assert(false); } + void Prev() override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } + + uint64_t num_itered() const { return num_itered_; } + + private: + InternalKeyComparator icmp_; + InternalIterator* inner_iter_; // not owned + uint64_t num_itered_ = 0; + bool need_count_entries_; +}; + class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what // CompactionIterator uses. Tests can override it. class CompactionProxy { public: - explicit CompactionProxy(const Compaction* compaction) - : compaction_(compaction) {} - virtual ~CompactionProxy() = default; - virtual int level(size_t /*compaction_input_level*/ = 0) const { - return compaction_->level(); - } + + virtual int level() const = 0; + virtual bool KeyNotExistsBeyondOutputLevel( - const Slice& user_key, std::vector* level_ptrs) const { + const Slice& user_key, std::vector* level_ptrs) const = 0; + + virtual bool bottommost_level() const = 0; + + virtual int number_levels() const = 0; + + virtual Slice GetLargestUserKey() const = 0; + + virtual bool allow_ingest_behind() const = 0; + + virtual bool preserve_deletes() const = 0; + + virtual bool enable_blob_garbage_collection() const = 0; + + virtual double blob_garbage_collection_age_cutoff() const = 0; + + virtual Version* input_version() const = 0; + }; + + class RealCompaction : public CompactionProxy { + public: + explicit RealCompaction(const Compaction* compaction) + : compaction_(compaction) { + assert(compaction_); + assert(compaction_->immutable_options()); + assert(compaction_->mutable_cf_options()); + } + + int level() const override { return compaction_->level(); } + + bool KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector* level_ptrs) const override { return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs); } - virtual bool bottommost_level() const { + + bool bottommost_level() const override { return compaction_->bottommost_level(); } - virtual int number_levels() const { return compaction_->number_levels(); } - virtual Slice GetLargestUserKey() const { + + int number_levels() const override { return compaction_->number_levels(); } + + Slice GetLargestUserKey() const override { return compaction_->GetLargestUserKey(); } - virtual bool allow_ingest_behind() const { - return compaction_->immutable_cf_options()->allow_ingest_behind; + + bool allow_ingest_behind() const override { + return 
compaction_->immutable_options()->allow_ingest_behind; + } + + bool preserve_deletes() const override { + return compaction_->immutable_options()->preserve_deletes; + } + + bool enable_blob_garbage_collection() const override { + return compaction_->mutable_cf_options()->enable_blob_garbage_collection; } - virtual bool preserve_deletes() const { - return compaction_->immutable_cf_options()->preserve_deletes; + + double blob_garbage_collection_age_cutoff() const override { + return compaction_->mutable_cf_options() + ->blob_garbage_collection_age_cutoff; } - protected: - CompactionProxy() = default; + Version* input_version() const override { + return compaction_->input_version(); + } private: const Compaction* compaction_; }; - CompactionIterator(InternalIterator* input, const Comparator* cmp, - MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, - BlobFileBuilder* blob_file_builder, - bool allow_data_in_errors, - const Compaction* compaction = nullptr, - const CompactionFilter* compaction_filter = nullptr, - const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - const std::atomic* manual_compaction_paused = nullptr, - const std::shared_ptr info_log = nullptr); + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + const Compaction* compaction = nullptr, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic* manual_compaction_paused = nullptr, + const std::atomic* manual_compaction_canceled = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr); // Constructor with custom CompactionProxy, used for tests. 
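// Aside (editorial sketch, not part of the patch): making CompactionProxy a
// pure-virtual interface is a test seam. Production code reaches the real
// Compaction through the RealCompaction adapter above, while tests hand in a
// double with canned answers. The self-contained model below uses
// hypothetical names; the real adapter forwards to Compaction instead of a
// plain struct.

struct ProxySketch {
  virtual ~ProxySketch() = default;
  virtual bool bottommost_level() const = 0;
};

struct RealThingSketch {  // stands in for the production object
  bool bottommost = false;
};

// Production-style adapter: forwards every query to the real source of truth.
class RealProxySketch : public ProxySketch {
 public:
  explicit RealProxySketch(const RealThingSketch* real) : real_(real) {}
  bool bottommost_level() const override { return real_->bottommost; }

 private:
  const RealThingSketch* real_;
};

// Test double: the test writes the field it wants the code under test to see.
struct FakeProxySketch : public ProxySketch {
  bool is_bottommost = false;
  bool bottommost_level() const override { return is_bottommost; }
};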
- CompactionIterator(InternalIterator* input, const Comparator* cmp, - MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, - BlobFileBuilder* blob_file_builder, - bool allow_data_in_errors, - std::unique_ptr compaction, - const CompactionFilter* compaction_filter = nullptr, - const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - const std::atomic* manual_compaction_paused = nullptr, - const std::shared_ptr info_log = nullptr); + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + std::unique_ptr compaction, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic* manual_compaction_paused = nullptr, + const std::atomic* manual_compaction_canceled = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr); ~CompactionIterator(); @@ -116,16 +207,36 @@ class CompactionIterator { bool Valid() const { return valid_; } const Slice& user_key() const { return current_user_key_; } const CompactionIterationStats& iter_stats() const { return iter_stats_; } + uint64_t num_input_entry_scanned() const { return input_.num_itered(); } private: // Processes the input stream to find the next output void NextFromInput(); - // Do last preparations before presenting the output to the callee. At this - // point this only zeroes out the sequence number if possible for better - // compression. + // Do final preparations before presenting the output to the callee. void PrepareOutput(); + // Passes the output value to the blob file builder (if any), and replaces it + // with the corresponding blob reference if it has been actually written to a + // blob file (i.e. if it passed the value size check). Returns true if the + // value got extracted to a blob file, false otherwise. + bool ExtractLargeValueIfNeededImpl(); + + // Extracts large values as described above, and updates the internal key's + // type to kTypeBlobIndex if the value got extracted. Should only be called + // for regular values (kTypeValue). + void ExtractLargeValueIfNeeded(); + + // Relocates valid blobs residing in the oldest blob files if garbage + // collection is enabled. Relocated blobs are written to new blob files or + // inlined in the LSM tree depending on the current settings (i.e. + // enable_blob_files and min_blob_size). Should only be called for blob + // references (kTypeBlobIndex). + // + // Note: the stacked BlobDB implementation's compaction filter based GC + // algorithm is also called from here. + void GarbageCollectBlobIfNeeded(); + // Invoke compaction filter if needed. // Return true on success, false on failures (e.g.: kIOError). 
bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); @@ -152,7 +263,24 @@ class CompactionIterator { bool IsInEarliestSnapshot(SequenceNumber sequence); - InternalIterator* input_; + // Extract user-defined timestamp from user key if possible and compare it + // with *full_history_ts_low_ if applicable. + inline void UpdateTimestampAndCompareWithFullHistoryLow() { + if (!timestamp_size_) { + return; + } + Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_); + curr_ts_.assign(ts.data(), ts.size()); + if (full_history_ts_low_) { + cmp_with_history_ts_low_ = + cmp_->CompareTimestamp(ts, *full_history_ts_low_); + } + } + + static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction); + + SequenceIterWrapper input_; const Comparator* cmp_; MergeHelper* merge_helper_; const std::vector* snapshots_; @@ -166,6 +294,7 @@ class CompactionIterator { const SequenceNumber earliest_write_conflict_snapshot_; const SnapshotChecker* const snapshot_checker_; Env* env_; + SystemClock* clock_; bool report_detailed_time_; bool expect_valid_internal_key_; CompactionRangeDelAggregator* range_del_agg_; @@ -174,6 +303,7 @@ class CompactionIterator { const CompactionFilter* compaction_filter_; const std::atomic* shutting_down_; const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_canceled_; const SequenceNumber preserve_deletes_seqnum_; bool bottommost_level_; bool valid_ = false; @@ -181,6 +311,20 @@ class CompactionIterator { SequenceNumber earliest_snapshot_; SequenceNumber latest_snapshot_; + std::shared_ptr info_log_; + + bool allow_data_in_errors_; + + // Comes from comparator. + const size_t timestamp_size_; + + // Lower bound timestamp to retain full history in terms of user-defined + // timestamp. If a key's timestamp is older than full_history_ts_low_, then + // the key *may* be eligible for garbage collection (GC). The skipping logic + // is in `NextFromInput()` and `PrepareOutput()`. + // If nullptr, NO GC will be performed and all history will be preserved. + const std::string* const full_history_ts_low_; + // State // // Points to a copy of the current compaction iterator output (current_key_) @@ -199,11 +343,13 @@ class CompactionIterator { // Stores whether ikey_.user_key is valid. If set to false, the user key is // not compared against the current key in the underlying iterator. bool has_current_user_key_ = false; - bool at_next_ = false; // If false, the iterator - // Holds a copy of the current compaction iterator output (or current key in - // the underlying iterator during NextFromInput()). + // If false, the iterator holds a copy of the current compaction iterator + // output (or current key in the underlying iterator during NextFromInput()). + bool at_next_ = false; + IterKey current_key_; Slice current_user_key_; + std::string curr_ts_; SequenceNumber current_user_key_sequence_; SequenceNumber current_user_key_snapshot_; @@ -218,7 +364,11 @@ class CompactionIterator { // PinnedIteratorsManager used to pin input_ Iterator blocks while reading // merge operands and then releasing them after consuming them. 
PinnedIteratorsManager pinned_iters_mgr_; + + uint64_t blob_garbage_collection_cutoff_file_number_; + std::string blob_index_; + PinnableSlice blob_value_; std::string compaction_filter_value_; InternalKey compaction_filter_skip_until_; // "level_ptrs" holds indices that remember which file of an associated @@ -233,9 +383,15 @@ class CompactionIterator { // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; - std::shared_ptr info_log_; - bool allow_data_in_errors_; + // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_) + int cmp_with_history_ts_low_; + + const int level_; + + void AdvanceInputIter() { input_.Next(); } + + void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); } bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. @@ -244,8 +400,10 @@ class CompactionIterator { bool IsPausingManualCompaction() { // This is a best-effort facility, so memory_order_relaxed is sufficient. - return manual_compaction_paused_ && - manual_compaction_paused_->load(std::memory_order_relaxed) > 0; + return (manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed) > 0) || + (manual_compaction_canceled_ && + manual_compaction_canceled_->load(std::memory_order_relaxed)); } }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 57db424894d..fef7b541786 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -38,7 +38,7 @@ class NoMergingMergeOp : public MergeOperator { // Compaction filter that gets stuck when it sees a particular key, // then gets unstuck when told to. -// Always returns Decition::kRemove. +// Always returns Decision::kRemove. class StallingFilter : public CompactionFilter { public: Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, @@ -156,23 +156,32 @@ class LoggingForwardVectorIterator : public InternalIterator { class FakeCompaction : public CompactionIterator::CompactionProxy { public: - FakeCompaction() = default; + int level() const override { return 0; } - int level(size_t /*compaction_input_level*/) const override { return 0; } bool KeyNotExistsBeyondOutputLevel( const Slice& /*user_key*/, std::vector* /*level_ptrs*/) const override { return is_bottommost_level || key_not_exists_beyond_output_level; } + bool bottommost_level() const override { return is_bottommost_level; } + int number_levels() const override { return 1; } + Slice GetLargestUserKey() const override { return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; } + bool allow_ingest_behind() const override { return is_allow_ingest_behind; } bool preserve_deletes() const override { return false; } + bool enable_blob_garbage_collection() const override { return false; } + + double blob_garbage_collection_age_cutoff() const override { return 0.0; } + + Version* input_version() const override { return nullptr; } + bool key_not_exists_beyond_output_level = false; bool is_bottommost_level = false; @@ -180,7 +189,7 @@ class FakeCompaction : public CompactionIterator::CompactionProxy { bool is_allow_ingest_behind = false; }; -// A simplifed snapshot checker which assumes each snapshot has a global +// A simplified snapshot checker which assumes each snapshot has a global // last visible sequence. 
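// Aside (simplified stand-in, not part of the patch): "global last visible
// sequence" means a record written at sequence number seq is treated as
// visible in snapshot snap iff seq <= last_visible(snap); a snapshot without
// an explicit entry falls back to its own sequence number. A minimal model:

#include <cstdint>
#include <map>
#include <utility>

class SimpleSnapshotCheckerSketch {
 public:
  explicit SimpleSnapshotCheckerSketch(
      std::map<uint64_t, uint64_t> last_visible)
      : last_visible_(std::move(last_visible)) {}

  // Returns true if a record written at `seq` is visible to `snapshot`.
  bool IsInSnapshot(uint64_t seq, uint64_t snapshot) const {
    const auto it = last_visible_.find(snapshot);
    const uint64_t bound = (it != last_visible_.end()) ? it->second : snapshot;
    return seq <= bound;
  }

 private:
  std::map<uint64_t, uint64_t> last_visible_;
};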
class TestSnapshotChecker : public SnapshotChecker { public: @@ -216,6 +225,9 @@ class CompactionIteratorTest : public testing::TestWithParam { CompactionIteratorTest() : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} + explicit CompactionIteratorTest(const Comparator* ucmp) + : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {} + void InitIterators( const std::vector& ks, const std::vector& vs, const std::vector& range_del_ks, @@ -224,7 +236,9 @@ class CompactionIteratorTest : public testing::TestWithParam { SequenceNumber last_committed_sequence = kMaxSequenceNumber, MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, bool bottommost_level = false, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { std::unique_ptr unfragmented_range_del_iter( new test::VectorIterator(range_del_ks, range_del_vs)); auto tombstone_list = std::make_shared( @@ -236,10 +250,12 @@ class CompactionIteratorTest : public testing::TestWithParam { range_del_agg_->AddTombstones(std::move(range_del_iter)); std::unique_ptr compaction; - if (filter || bottommost_level) { + if (filter || bottommost_level || key_not_exists_beyond_output_level) { compaction_proxy_ = new FakeCompaction(); compaction_proxy_->is_bottommost_level = bottommost_level; compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind(); + compaction_proxy_->key_not_exists_beyond_output_level = + key_not_exists_beyond_output_level; compaction.reset(compaction_proxy_); } bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); @@ -252,6 +268,11 @@ class CompactionIteratorTest : public testing::TestWithParam { 0 /*latest_snapshot*/, snapshot_checker_.get(), 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); + if (c_iter_) { + // Since iter_ is still used in ~CompactionIterator(), we call + // ~CompactionIterator() first. 
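// Aside (illustrative only, not part of the patch): the reset order above
// matters because the compaction iterator keeps a raw pointer to the wrapped
// input iterator; destroying the dependent object first avoids a dangling
// pointer. Generic pattern, with hypothetical types:

#include <memory>

struct InnerSketch {};  // the object pointed to
struct WrapperSketch {  // holds a non-owning pointer into InnerSketch
  explicit WrapperSketch(InnerSketch* in) : in_(in) {}
  InnerSketch* in_;
};

inline void RebuildSketch(std::unique_ptr<InnerSketch>& inner,
                          std::unique_ptr<WrapperSketch>& wrapper) {
  wrapper.reset();                          // drop the user of the pointer first
  inner = std::make_unique<InnerSketch>();  // now safe to replace the pointee
  wrapper = std::make_unique<WrapperSketch>(inner.get());
}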
+ c_iter_.reset(); + } iter_.reset(new LoggingForwardVectorIterator(ks, vs)); iter_->SeekToFirst(); c_iter_.reset(new CompactionIterator( @@ -259,8 +280,11 @@ class CompactionIteratorTest : public testing::TestWithParam { earliest_write_conflict_snapshot, snapshot_checker_.get(), Env::Default(), false /* report_detailed_time */, false, range_del_agg_.get(), nullptr /* blob_file_builder */, - false /*allow_data_in_errors*/, std::move(compaction), filter, - &shutting_down_)); + true /*allow_data_in_errors*/, std::move(compaction), filter, + &shutting_down_, /*preserve_deletes_seqnum=*/0, + /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, /*info_log=*/nullptr, + full_history_ts_low)); } void AddSnapshot(SequenceNumber snapshot, @@ -282,10 +306,13 @@ class CompactionIteratorTest : public testing::TestWithParam { MergeOperator* merge_operator = nullptr, CompactionFilter* compaction_filter = nullptr, bool bottommost_level = false, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, last_committed_seq, merge_operator, compaction_filter, - bottommost_level, earliest_write_conflict_snapshot); + bottommost_level, earliest_write_conflict_snapshot, + key_not_exists_beyond_output_level, full_history_ts_low); c_iter_->SeekToFirst(); for (size_t i = 0; i < expected_keys.size(); i++) { std::string info = "i = " + ToString(i); @@ -299,6 +326,11 @@ class CompactionIteratorTest : public testing::TestWithParam { ASSERT_FALSE(c_iter_->Valid()); } + void ClearSnapshots() { + snapshots_.clear(); + snapshot_map_.clear(); + } + const Comparator* cmp_; const InternalKeyComparator icmp_; std::vector snapshots_; @@ -680,7 +712,7 @@ TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, {"v1", "v2"}, {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, - {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/, + {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -689,15 +721,14 @@ TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { // permanently. 
TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { AddSnapshot(1); - RunTest({test::KeyStr("a", 1, kTypeDeletion), - test::KeyStr("b", 3, kTypeDeletion), - test::KeyStr("b", 1, kTypeValue)}, - {"", "", ""}, - {test::KeyStr("b", 3, kTypeDeletion), - test::KeyStr("b", 0, kTypeValue)}, - {"", ""}, - kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, - nullptr /*compaction_filter*/, true /*bottommost_level*/); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); } // In bottommost level, single deletions earlier than earliest snapshot can be @@ -707,7 +738,7 @@ TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) { RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), test::KeyStr("b", 2, kTypeSingleDeletion)}, {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""}, - kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -864,7 +895,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, {"v1", "v2", "v3"}, {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue), test::KeyStr("c", 3, kTypeValue)}, - {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/, + {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -875,9 +906,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, RunTest( {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), test::KeyStr("c", 3, kTypeDeletion)}, - {"", "", ""}, - {}, - {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -885,15 +914,14 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, TEST_F(CompactionIteratorWithSnapshotCheckerTest, NotRemoveDeletionIfValuePresentToEarlierSnapshot) { AddSnapshot(2,1); - RunTest( - {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), - test::KeyStr("b", 3, kTypeValue)}, - {"", "", ""}, - {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue), - test::KeyStr("b", 3, kTypeValue)}, - {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/, - nullptr /*merge_operator*/, nullptr /*compaction_filter*/, - true /*bottommost_level*/); + RunTest({test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); } TEST_F(CompactionIteratorWithSnapshotCheckerTest, @@ -905,7 +933,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, {"", "", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion), test::KeyStr("c", 3, kTypeSingleDeletion)}, - {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, 
true /*bottommost_level*/); } @@ -939,9 +967,24 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, 2 /*earliest_write_conflict_snapshot*/); } +// Same as above but with a blob index. In addition to the value getting +// trimmed, the type of the KV is changed to kTypeValue. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking_BlobIndex) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeBlobIndex)}, + {"", "fake_blob_index"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + // Compaction filter should keep uncommitted key as-is, and -// * Convert the latest velue to deletion, and/or -// * if latest value is a merge, apply filter to all suequent merges. +// * Convert the latest value to deletion, and/or +// * if latest value is a merge, apply filter to all subsequent merges. TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { std::unique_ptr compaction_filter( @@ -1033,6 +1076,188 @@ INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance, CompactionIteratorWithAllowIngestBehindTest, testing::Values(true, false)); +class CompactionIteratorTsGcTest : public CompactionIteratorTest { + public: + CompactionIteratorTsGcTest() + : CompactionIteratorTest(test::ComparatorWithU64Ts()) {} +}; + +TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"a3", "", "b2"}; + std::string full_history_ts_low; + // All keys' timestamps are newer than or equal to 102, thus none of them + // will be eligible for GC. + PutFixed64(&full_history_ts_low, 102); + const std::vector& expected_keys = input_keys; + const std::vector& expected_values = input_values; + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"", "a2", "a1", "b5"}; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + { + // With a snapshot at seq 3, both the deletion marker and the key at 3 must + // be preserved. 
+ AddSnapshot(3); + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[3]}; + const std::vector expected_values = {"", "a2", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + ClearSnapshots(); + } + { + // No snapshot, the deletion marker should be preserved because the user + // key may appear beyond output level. + const std::vector expected_keys = {input_keys[0], + input_keys[3]}; + const std::vector expected_values = {"", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } + { + // No snapshot, the deletion marker can be dropped because the user key + // does not appear in higher levels. + const std::vector expected_keys = {input_keys[3]}; + const std::vector expected_values = {"b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "a1", "a0"}; + { + std::string full_history_ts_low; + // Keys whose timestamps larger than or equal to 102 will be preserved. + PutFixed64(&full_history_ts_low, 102); + const std::vector expected_keys = {input_keys[0], + input_keys[1]}; + const std::vector expected_values = {"", "a2"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, DropTombstones) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = {input_keys[0], input_keys[1]}; + const std::vector expected_values = {"", "a2"}; + + // Take a snapshot at seq 2. + AddSnapshot(2); + + { + // Non-bottommost level, but key does not exist beyond output level. 
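// Aside (sketch, not part of the patch): full_history_ts_low in these tests
// is an 8-byte encoded uint64_t timestamp; PutFixed64 appends a fixed-width
// little-endian encoding. A minimal stand-in encoder:

#include <cstdint>
#include <string>

inline void AppendFixed64Sketch(std::string* dst, uint64_t value) {
  for (int i = 0; i < 8; ++i) {
    dst->push_back(static_cast<char>((value >> (8 * i)) & 0xff));
  }
}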
+ std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_sequence=*/kMaxSequenceNumber, + /*merge_op=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } + { + // Bottommost level + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, RewriteTs) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[2], + test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)}; + const std::vector expected_values = {"", "a2", "", "a0"}; + + AddSnapshot(1); + AddSnapshot(2); + + { + // Bottommost level and need to rewrite both ts and seq. + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance, + CompactionIteratorTsGcTest, + testing::Values(true, false)); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 91fa2dc32e4..7928273426d 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -20,7 +20,10 @@ #include #include +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_builder.h" #include "db/builder.h" +#include "db/compaction/clipping_iterator.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -44,6 +47,8 @@ #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -51,6 +56,7 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" @@ -126,10 +132,12 @@ struct CompactionJob::SubcompactionState { // Files produced by this subcompaction struct Output { Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, - bool _enable_order_check, bool _enable_hash) + bool _enable_order_check, bool _enable_hash, bool 
_finished = false, + uint64_t precalculated_hash = 0) : meta(std::move(_meta)), - validator(_icmp, _enable_order_check, _enable_hash), - finished(false) {} + validator(_icmp, _enable_order_check, _enable_hash, + precalculated_hash), + finished(_finished) {} FileMetaData meta; OutputValidator validator; bool finished; @@ -138,6 +146,7 @@ struct CompactionJob::SubcompactionState { // State kept for output being generated std::vector outputs; + std::vector blob_file_additions; std::unique_ptr outfile; std::unique_ptr builder; @@ -146,7 +155,7 @@ struct CompactionJob::SubcompactionState { // This subcompaction's output could be empty if compaction was aborted // before this subcompaction had a chance to generate any output files. // When subcompactions are executed sequentially this is more likely and - // will be particulalry likely for the later subcompactions to be empty. + // will be particularly likely for the later subcompactions to be empty. // Once they are run in parallel however it should be much rarer. return nullptr; } else { @@ -231,21 +240,13 @@ struct CompactionJob::CompactionState { std::vector sub_compact_states; Status status; - uint64_t total_bytes; - uint64_t num_output_records; - - explicit CompactionState(Compaction* c) - : compaction(c), - total_bytes(0), - num_output_records(0) {} + size_t num_output_files = 0; + uint64_t total_bytes = 0; + size_t num_blob_output_files = 0; + uint64_t total_blob_bytes = 0; + uint64_t num_output_records = 0; - size_t NumOutputFiles() { - size_t total = 0; - for (auto& s : sub_compact_states) { - total += s.outputs.size(); - } - return total; - } + explicit CompactionState(Compaction* c) : compaction(c) {} Slice SmallestUserKey() { for (const auto& sub_compact_state : sub_compact_states) { @@ -272,21 +273,40 @@ struct CompactionJob::CompactionState { }; void CompactionJob::AggregateStatistics() { + assert(compact_); + for (SubcompactionState& sc : compact_->sub_compact_states) { + auto& outputs = sc.outputs; + + if (!outputs.empty() && !outputs.back().meta.fd.file_size) { + // An error occurred, so ignore the last output. 
+ outputs.pop_back(); + } + + compact_->num_output_files += outputs.size(); compact_->total_bytes += sc.total_bytes; + + const auto& blobs = sc.blob_file_additions; + + compact_->num_blob_output_files += blobs.size(); + + for (const auto& blob : blobs) { + compact_->total_blob_bytes += blob.GetTotalBlobBytes(); + } + compact_->num_output_records += sc.num_output_records; - } - for (SubcompactionState& sc : compact_->sub_compact_states) { + compaction_job_stats_->Add(sc.compaction_job_stats); } } CompactionJob::CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const FileOptions& file_options, VersionSet* versions, - const std::atomic* shutting_down, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, - FSDirectory* db_directory, FSDirectory* output_directory, Statistics* stats, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, @@ -294,16 +314,24 @@ CompactionJob::CompactionJob( EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const std::atomic* manual_compaction_paused, const std::string& db_id, - const std::string& db_session_id) - : job_id_(job_id), - compact_(new CompactionState(compaction)), - compaction_job_stats_(compaction_job_stats), + const std::atomic* manual_compaction_paused, + const std::atomic* manual_compaction_canceled, + const std::string& db_id, const std::string& db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) + : compact_(new CompactionState(compaction)), compaction_stats_(compaction->compaction_reason(), 1), + db_options_(db_options), + mutable_db_options_copy_(mutable_db_options), + log_buffer_(log_buffer), + output_directory_(output_directory), + stats_(stats), + bottommost_level_(false), + write_hint_(Env::WLTH_NOT_SET), + job_id_(job_id), + compaction_job_stats_(compaction_job_stats), dbname_(dbname), db_id_(db_id), db_session_id_(db_session_id), - db_options_(db_options), file_options_(file_options), env_(db_options.env), io_tracer_(io_tracer), @@ -313,11 +341,10 @@ CompactionJob::CompactionJob( versions_(versions), shutting_down_(shutting_down), manual_compaction_paused_(manual_compaction_paused), + manual_compaction_canceled_(manual_compaction_canceled), preserve_deletes_seqnum_(preserve_deletes_seqnum), - log_buffer_(log_buffer), db_directory_(db_directory), - output_directory_(output_directory), - stats_(stats), + blob_output_directory_(blob_output_directory), db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), @@ -325,11 +352,11 @@ CompactionJob::CompactionJob( snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), event_logger_(event_logger), - bottommost_level_(false), paranoid_file_checks_(paranoid_file_checks), measure_io_stats_(measure_io_stats), - write_hint_(Env::WLTH_NOT_SET), - thread_pri_(thread_pri) { + thread_pri_(thread_pri), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback) { assert(compaction_job_stats_ != nullptr); assert(log_buffer_ 
!= nullptr); const auto* cfd = compact_->compaction->column_family_data(); @@ -391,7 +418,7 @@ void CompactionJob::Prepare() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PREPARE); - // Generate file_levels_ for compaction berfore making Iterator + // Generate file_levels_ for compaction before making Iterator auto* c = compact_->compaction; assert(c->column_family_data() != nullptr); assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( @@ -403,7 +430,7 @@ void CompactionJob::Prepare() { if (c->ShouldFormSubcompactions()) { { - StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME); + StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); GenSubcompactionBoundaries(); } assert(sizes_.size() == boundaries_.size() + 1); @@ -526,9 +553,10 @@ void CompactionJob::GenSubcompactionBoundaries() { int base_level = v->storage_info()->base_level(); uint64_t max_output_files = static_cast(std::ceil( sum / min_file_fill_percent / - MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl, - c->immutable_cf_options()->compaction_style, base_level, - c->immutable_cf_options()->level_compaction_dynamic_level_bytes))); + MaxFileSizeForLevel( + *(c->mutable_cf_options()), out_lvl, + c->immutable_options()->compaction_style, base_level, + c->immutable_options()->level_compaction_dynamic_level_bytes))); uint64_t subcompactions = std::min({static_cast(ranges.size()), static_cast(c->max_subcompactions()), @@ -569,7 +597,7 @@ Status CompactionJob::Run() { const size_t num_threads = compact_->sub_compact_states.size(); assert(num_threads > 0); - const uint64_t start_micros = env_->NowMicros(); + const uint64_t start_micros = db_options_.clock->NowMicros(); // Launch a thread for each of subcompactions 1...num_threads-1 std::vector thread_pool; @@ -588,7 +616,7 @@ Status CompactionJob::Run() { thread.join(); } - compaction_stats_.micros = env_->NowMicros() - start_micros; + compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; compaction_stats_.cpu_micros = 0; for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) { compaction_stats_.cpu_micros += @@ -604,18 +632,34 @@ Status CompactionJob::Run() { // Check if any thread encountered an error during execution Status status; IOStatus io_s; + bool wrote_new_blob_files = false; + for (const auto& state : compact_->sub_compact_states) { if (!state.status.ok()) { status = state.status; io_s = state.io_status; break; } + + if (!state.blob_file_additions.empty()) { + wrote_new_blob_files = true; + } } + if (io_status_.ok()) { io_status_ = io_s; } - if (status.ok() && output_directory_) { - io_s = output_directory_->Fsync(IOOptions(), nullptr); + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->Fsync(IOOptions(), dbg); + } + + if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ && + blob_output_directory_ != output_directory_) { + io_s = blob_output_directory_->Fsync(IOOptions(), dbg); + } } if (io_status_.ok()) { io_status_ = io_s; @@ -711,7 +755,7 @@ Status CompactionJob::Run() { for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { auto fn = - TableFileName(state.compaction->immutable_cf_options()->cf_paths, + TableFileName(state.compaction->immutable_options()->cf_paths, output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); tp[fn] = output.table_properties; } @@ -721,6 +765,7 @@ Status CompactionJob::Run() { // Finish up all book-keeping to unify 
the subcompaction results AggregateStatistics(); UpdateCompactionStats(); + RecordCompactionIOStats(); LogFlush(db_options_.info_log); TEST_SYNC_POINT("CompactionJob::Run():End"); @@ -730,11 +775,16 @@ Status CompactionJob::Run() { } Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { + assert(compact_); + AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_INSTALL); db_mutex_->AssertHeld(); Status status = compact_->status; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + assert(cfd); + cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), thread_pri_, compaction_stats_); @@ -744,6 +794,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { if (!versions_->io_status().ok()) { io_status_ = versions_->io_status(); } + VersionStorageInfo::LevelSummaryStorage tmp; auto vstorage = cfd->current()->storage_info(); const auto& stats = compaction_stats_; @@ -753,53 +804,78 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { double bytes_read_per_sec = 0; double bytes_written_per_sec = 0; - if (stats.bytes_read_non_output_levels > 0) { - read_write_amp = (stats.bytes_written + stats.bytes_read_output_level + - stats.bytes_read_non_output_levels) / - static_cast(stats.bytes_read_non_output_levels); - write_amp = stats.bytes_written / - static_cast(stats.bytes_read_non_output_levels); + const uint64_t bytes_read_non_output_and_blob = + stats.bytes_read_non_output_levels + stats.bytes_read_blob; + const uint64_t bytes_read_all = + stats.bytes_read_output_level + bytes_read_non_output_and_blob; + const uint64_t bytes_written_all = + stats.bytes_written + stats.bytes_written_blob; + + if (bytes_read_non_output_and_blob > 0) { + read_write_amp = (bytes_written_all + bytes_read_all) / + static_cast(bytes_read_non_output_and_blob); + write_amp = + bytes_written_all / static_cast(bytes_read_non_output_and_blob); } if (stats.micros > 0) { - bytes_read_per_sec = - (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) / - static_cast(stats.micros); + bytes_read_per_sec = bytes_read_all / static_cast(stats.micros); bytes_written_per_sec = - stats.bytes_written / static_cast(stats.micros); + bytes_written_all / static_cast(stats.micros); } + const std::string& column_family_name = cfd->GetName(); + + constexpr double kMB = 1048576.0; + ROCKS_LOG_BUFFER( log_buffer_, "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " - "files in(%d, %d) out(%d) " - "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s, records in: %" PRIu64 + "files in(%d, %d) out(%d +%d blob) " + "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), " + "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64 ", records dropped: %" PRIu64 " output_compression: %s\n", - cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec, - bytes_written_per_sec, compact_->compaction->output_level(), + column_family_name.c_str(), vstorage->LevelSummary(&tmp), + bytes_read_per_sec, bytes_written_per_sec, + compact_->compaction->output_level(), stats.num_input_files_in_non_output_levels, stats.num_input_files_in_output_level, stats.num_output_files, - stats.bytes_read_non_output_levels / 1048576.0, - stats.bytes_read_output_level / 1048576.0, - stats.bytes_written / 1048576.0, read_write_amp, write_amp, - status.ToString().c_str(), stats.num_input_records, + stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB, 
+ stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB, + stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp, + write_amp, status.ToString().c_str(), stats.num_input_records, stats.num_dropped_records, CompressionTypeToString(compact_->compaction->output_compression()) .c_str()); + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 + "\n", + column_family_name.c_str(), blob_files.begin()->first, + blob_files.rbegin()->first); + } + UpdateCompactionJobStats(stats); auto stream = event_logger_->LogToBuffer(log_buffer_); stream << "job" << job_id_ << "event" << "compaction_finished" - << "compaction_time_micros" << compaction_stats_.micros - << "compaction_time_cpu_micros" << compaction_stats_.cpu_micros - << "output_level" << compact_->compaction->output_level() - << "num_output_files" << compact_->NumOutputFiles() - << "total_output_size" << compact_->total_bytes << "num_input_records" - << compaction_stats_.num_input_records << "num_output_records" - << compact_->num_output_records << "num_subcompactions" - << compact_->sub_compact_states.size() << "output_compression" + << "compaction_time_micros" << stats.micros + << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" + << compact_->compaction->output_level() << "num_output_files" + << compact_->num_output_files << "total_output_size" + << compact_->total_bytes; + + if (compact_->num_blob_output_files > 0) { + stream << "num_blob_output_files" << compact_->num_blob_output_files + << "total_blob_output_size" << compact_->total_blob_bytes; + } + + stream << "num_input_records" << stats.num_input_records + << "num_output_records" << compact_->num_output_records + << "num_subcompactions" << compact_->sub_compact_states.size() + << "output_compression" << CompressionTypeToString(compact_->compaction->output_compression()); stream << "num_single_delete_mismatches" @@ -823,14 +899,173 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { } stream.EndArray(); + if (!blob_files.empty()) { + stream << "blob_file_head" << blob_files.begin()->first; + stream << "blob_file_tail" << blob_files.rbegin()->first; + } + CleanupCompaction(); return status; } +#ifndef ROCKSDB_LITE +void CompactionJob::ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact) { + assert(sub_compact); + assert(sub_compact->compaction); + assert(db_options_.compaction_service); + + const Compaction* compaction = sub_compact->compaction; + CompactionServiceInput compaction_input; + compaction_input.output_level = compaction->output_level(); + + const std::vector& inputs = + *(compact_->compaction->inputs()); + for (const auto& files_per_level : inputs) { + for (const auto& file : files_per_level.files) { + compaction_input.input_files.emplace_back( + MakeTableFileName(file->fd.GetNumber())); + } + } + compaction_input.column_family.name = + compaction->column_family_data()->GetName(); + compaction_input.column_family.options = + compaction->column_family_data()->GetLatestCFOptions(); + compaction_input.db_options = + BuildDBOptions(db_options_, mutable_db_options_copy_); + compaction_input.snapshots = existing_snapshots_; + compaction_input.has_begin = sub_compact->start; + compaction_input.begin = + compaction_input.has_begin ? 
sub_compact->start->ToString() : ""; + compaction_input.has_end = sub_compact->end; + compaction_input.end = + compaction_input.has_end ? sub_compact->end->ToString() : ""; + compaction_input.approx_size = sub_compact->approx_size; + + std::string compaction_input_binary; + Status s = compaction_input.Write(&compaction_input_binary); + if (!s.ok()) { + sub_compact->status = s; + return; + } + + std::ostringstream input_files_oss; + bool is_first_one = true; + for (const auto& file : compaction_input.input_files) { + input_files_oss << (is_first_one ? "" : ", ") << file; + is_first_one = false; + } + + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_input.output_level, input_files_oss.str().c_str()); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->Start(compaction_input_binary, job_id_); + if (compaction_status != CompactionServiceJobStatus::kSuccess) { + sub_compact->status = + Status::Incomplete("CompactionService failed to start compaction job."); + return; + } + + std::string compaction_result_binary; + compaction_status = db_options_.compaction_service->WaitForComplete( + job_id_, &compaction_result_binary); + + CompactionServiceResult compaction_result; + s = CompactionServiceResult::Read(compaction_result_binary, + &compaction_result); + if (compaction_status != CompactionServiceJobStatus::kSuccess) { + sub_compact->status = + s.ok() ? compaction_result.status + : Status::Incomplete( + "CompactionService failed to run compaction job."); + compaction_result.status.PermitUncheckedError(); + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed, status: %s", + compaction_input.column_family.name.c_str(), job_id_, + s.ToString().c_str()); + return; + } + + if (!s.ok()) { + sub_compact->status = s; + compaction_result.status.PermitUncheckedError(); + return; + } + sub_compact->status = compaction_result.status; + + std::ostringstream output_files_oss; + is_first_one = true; + for (const auto& file : compaction_result.output_files) { + output_files_oss << (is_first_one ? 
"" : ", ") << file.file_name; + is_first_one = false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Receive remote compaction result, output path: " + "%s, files: %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_result.output_path.c_str(), + output_files_oss.str().c_str()); + + if (!s.ok()) { + sub_compact->status = s; + return; + } + + for (const auto& file : compaction_result.output_files) { + uint64_t file_num = versions_->NewFileNumber(); + auto src_file = compaction_result.output_path + "/" + file.file_name; + auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, + file_num, compaction->output_path_id()); + s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); + if (!s.ok()) { + sub_compact->status = s; + return; + } + + FileMetaData meta; + uint64_t file_size; + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + if (!s.ok()) { + sub_compact->status = s; + return; + } + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, + file.smallest_seqno, file.largest_seqno); + meta.smallest.DecodeFrom(file.smallest_internal_key); + meta.largest.DecodeFrom(file.largest_internal_key); + meta.oldest_ancester_time = file.oldest_ancester_time; + meta.file_creation_time = file.file_creation_time; + meta.marked_for_compaction = file.marked_for_compaction; + + auto cfd = compaction->column_family_data(); + sub_compact->outputs.emplace_back(std::move(meta), + cfd->internal_comparator(), false, false, + true, file.paranoid_hash); + } + sub_compact->compaction_job_stats = compaction_result.stats; + sub_compact->num_output_records = compaction_result.num_output_records; + sub_compact->approx_size = compaction_input.approx_size; // is this used? + sub_compact->total_bytes = compaction_result.total_bytes; + IOSTATS_ADD(bytes_written, compaction_result.bytes_written); + IOSTATS_ADD(bytes_read, compaction_result.bytes_read); +} +#endif // !ROCKSDB_LITE + void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { - assert(sub_compact != nullptr); + assert(sub_compact); + assert(sub_compact->compaction); - uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000; +#ifndef ROCKSDB_LITE + if (db_options_.compaction_service) { + return ProcessKeyValueCompactionWithCompactionService(sub_compact); + } +#endif // !ROCKSDB_LITE + + uint64_t prev_cpu_micros = db_options_.clock->CPUNanos() / 1000; ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); @@ -853,6 +1088,10 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), existing_snapshots_); + + const Slice* const start = sub_compact->start; + const Slice* const end = sub_compact->end; + ReadOptions read_options; read_options.verify_checksums = true; read_options.fill_cache = false; @@ -862,11 +1101,42 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. read_options.total_order_seek = true; + // Note: if we're going to support subcompactions for user-defined timestamps, + // the timestamp part will have to be stripped from the bounds here. 
+ assert((!start && !end) || cfd->user_comparator()->timestamp_size() == 0); + read_options.iterate_lower_bound = start; + read_options.iterate_upper_bound = end; + // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. - std::unique_ptr input( + std::unique_ptr raw_input( versions_->MakeInputIterator(read_options, sub_compact->compaction, &range_del_agg, file_options_for_read_)); + InternalIterator* input = raw_input.get(); + + IterKey start_ikey; + IterKey end_ikey; + Slice start_slice; + Slice end_slice; + + if (start) { + start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); + start_slice = start_ikey.GetInternalKey(); + } + if (end) { + end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek); + end_slice = end_ikey.GetInternalKey(); + } + + std::unique_ptr clip; + if (start || end) { + clip.reset(new ClippingIterator( + raw_input.get(), start ? &start_slice : nullptr, + end ? &end_slice : nullptr, &cfd->internal_comparator())); + input = clip.get(); + } + + input->SeekToFirst(); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); @@ -892,12 +1162,29 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } MergeHelper merge( - env_, cfd->user_comparator(), cfd->ioptions()->merge_operator, + env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(), compaction_filter, db_options_.info_log.get(), false /* internal key corruption is expected */, existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), snapshot_checker_, compact_->compaction->level(), - db_options_.statistics.get(), shutting_down_); + db_options_.stats, shutting_down_); + + const MutableCFOptions* mutable_cf_options = + sub_compact->compaction->mutable_cf_options(); + assert(mutable_cf_options); + + std::vector blob_file_paths; + + std::unique_ptr blob_file_builder( + mutable_cf_options->enable_blob_files + ? new BlobFileBuilder(versions_, fs_.get(), + sub_compact->compaction->immutable_options(), + mutable_cf_options, &file_options_, job_id_, + cfd->GetID(), cfd->GetName(), + Env::IOPriority::IO_LOW, write_hint_, + io_tracer_, blob_callback_, &blob_file_paths, + &sub_compact->blob_file_additions) + : nullptr); TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); TEST_SYNC_POINT_CALLBACK( @@ -905,26 +1192,18 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { reinterpret_cast( const_cast*>(manual_compaction_paused_))); - Slice* start = sub_compact->start; - Slice* end = sub_compact->end; - if (start != nullptr) { - IterKey start_iter; - start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); - input->Seek(start_iter.GetInternalKey()); - } else { - input->SeekToFirst(); - } - Status status; + const std::string* const full_history_ts_low = + full_history_ts_low_.empty() ? 
nullptr : &full_history_ts_low_; sub_compact->c_iter.reset(new CompactionIterator( - input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), + input, cfd->user_comparator(), &merge, versions_->LastSequence(), &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), /*expect_valid_internal_key=*/true, &range_del_agg, - /* blob_file_builder */ nullptr, db_options_.allow_data_in_errors, + blob_file_builder.get(), db_options_.allow_data_in_errors, sub_compact->compaction, compaction_filter, shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_, - db_options_.info_log)); + manual_compaction_canceled_, db_options_.info_log, full_history_ts_low)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { @@ -948,12 +1227,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { const Slice& key = c_iter->key(); const Slice& value = c_iter->value(); - // If an end key (exclusive) is specified, check if the current key is - // >= than it and exit if it is because the iterator is out of its range - if (end != nullptr && - cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) { - break; - } + assert(!end || + cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0); + if (c_iter_stats.num_input_records % kRecordStatsEvery == kRecordStatsEvery - 1) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); @@ -1037,6 +1313,10 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } } + sub_compact->compaction_job_stats.num_blobs_read = + c_iter_stats.num_blobs_read; + sub_compact->compaction_job_stats.total_blob_bytes_read = + c_iter_stats.total_blob_bytes_read; sub_compact->compaction_job_stats.num_input_deletion_records = c_iter_stats.num_input_deletion_records; sub_compact->compaction_job_stats.num_corrupt_keys = @@ -1064,8 +1344,10 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { status = Status::ShutdownInProgress("Database shutdown"); } if ((status.ok() || status.IsColumnFamilyDropped()) && - (manual_compaction_paused_ && - manual_compaction_paused_->load(std::memory_order_relaxed) > 0)) { + ((manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed) > 0) || + (manual_compaction_canceled_ && + manual_compaction_canceled_->load(std::memory_order_relaxed)))) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } if (status.ok()) { @@ -1093,8 +1375,17 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); } + if (blob_file_builder) { + if (status.ok()) { + status = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(); + } + blob_file_builder.reset(); + } + sub_compact->compaction_job_stats.cpu_micros = - env_->NowCPUNanos() / 1000 - prev_cpu_micros; + db_options_.clock->CPUNanos() / 1000 - prev_cpu_micros; if (measure_io_stats_) { sub_compact->compaction_job_stats.file_write_nanos += @@ -1125,7 +1416,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { #endif // ROCKSDB_ASSERT_STATUS_CHECKED sub_compact->c_iter.reset(); - input.reset(); + clip.reset(); + raw_input.reset(); sub_compact->status = status; } @@ -1373,7 +1665,7 @@ Status CompactionJob::FinishCompactionOutputFile( // Finish and check for file 
errors if (s.ok()) { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + StopWatch sw(db_options_.clock, stats_, COMPACTION_OUTFILE_SYNC_MICROS); io_s = sub_compact->outfile->Sync(db_options_.use_fsync); } if (s.ok() && io_s.ok()) { @@ -1408,9 +1700,20 @@ Status CompactionJob::FinishCompactionOutputFile( // This happens when the output level is bottom level, at the same time // the sub_compact output nothing. std::string fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + TableFileName(sub_compact->compaction->immutable_options()->cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); - env_->DeleteFile(fname); + + // TODO(AR) it is not clear if there are any larger implications if + // DeleteFile fails here + Status ds = env_->DeleteFile(fname); + if (!ds.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64 + " at bottom level%s", + cfd->GetName().c_str(), job_id_, output_number, + meta->marked_for_compaction ? " (need compaction)" : ""); + } // Also need to remove the file from outputs, or it will be added to the // VersionEdit. @@ -1434,9 +1737,7 @@ Status CompactionJob::FinishCompactionOutputFile( FileDescriptor output_fd; uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; if (meta != nullptr) { - fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, - meta->fd.GetNumber(), meta->fd.GetPathId()); + fname = GetTableFileName(meta->fd.GetNumber()); output_fd = meta->fd; oldest_blob_file_number = meta->oldest_blob_file_number; } else { @@ -1465,9 +1766,7 @@ Status CompactionJob::FinishCompactionOutputFile( "CompactionJob::FinishCompactionOutputFile:" "MaxAllowedSpaceReached"); InstrumentedMutexLock l(db_mutex_); - // Should handle return error? - db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); } } #endif @@ -1479,9 +1778,13 @@ Status CompactionJob::FinishCompactionOutputFile( Status CompactionJob::InstallCompactionResults( const MutableCFOptions& mutable_cf_options) { + assert(compact_); + db_mutex_->AssertHeld(); auto* compaction = compact_->compaction; + assert(compaction); + // paranoia: verify that the files that we started with // still exist in the current version and in the same original level. 
// This ensures that a concurrent compaction did not erroneously @@ -1497,23 +1800,32 @@ Status CompactionJob::InstallCompactionResults( { Compaction::InputLevelSummaryBuffer inputs_summary; - ROCKS_LOG_INFO( - db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", - compaction->column_family_data()->GetName().c_str(), job_id_, - compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), + compact_->total_bytes + compact_->total_blob_bytes); } + VersionEdit* const edit = compaction->edit(); + assert(edit); + // Add compaction inputs - compaction->AddInputDeletions(compact_->compaction->edit()); + compaction->AddInputDeletions(edit); for (const auto& sub_compact : compact_->sub_compact_states) { for (const auto& out : sub_compact.outputs) { - compaction->edit()->AddFile(compaction->output_level(), out.meta); + edit->AddFile(compaction->output_level(), out.meta); + } + + for (const auto& blob : sub_compact.blob_file_additions) { + edit->AddBlobFile(blob); } } + return versions_->LogAndApply(compaction->column_family_data(), - mutable_cf_options, compaction->edit(), - db_mutex_, db_directory_); + mutable_cf_options, edit, db_mutex_, + db_directory_); } void CompactionJob::RecordCompactionIOStats() { @@ -1545,9 +1857,7 @@ Status CompactionJob::OpenCompactionOutputFile( assert(sub_compact->builder == nullptr); // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); - std::string fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, - file_number, sub_compact->compaction->output_path_id()); + std::string fname = GetTableFileName(file_number); // Fire events. ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); #ifndef ROCKSDB_LITE @@ -1562,9 +1872,17 @@ Status CompactionJob::OpenCompactionOutputFile( TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", &syncpoint_arg); #endif + + // Pass temperature of bottommost files to FileSystem. + FileOptions fo_copy = file_options_; + Temperature temperature = Temperature::kUnknown; + if (bottommost_level_) { + fo_copy.temperature = temperature = + sub_compact->compaction->mutable_cf_options()->bottommost_temperature; + } + Status s; - IOStatus io_s = - NewWritableFile(fs_.get(), fname, &writable_file, file_options_); + IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy); s = io_s; if (sub_compact->io_status.ok()) { sub_compact->io_status = io_s; @@ -1590,7 +1908,7 @@ Status CompactionJob::OpenCompactionOutputFile( // Try to figure out the output file's oldest ancester time. int64_t temp_current_time = 0; - auto get_time_status = env_->GetCurrentTime(&temp_current_time); + auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time); // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
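As an aside to the temperature handling a few hunks above: when the output sits on the bottommost level, the FileOptions handed to NewWritableFile now carry the column family's bottommost_temperature, so a FileSystem implementation can act on it. A rough sketch of such a consumer, not part of this patch; the constructor is inherited because FileSystemWrapper's constructor signature has varied across releases, and the cold-path mapping is purely hypothetical.

#include <memory>
#include <string>

#include "rocksdb/file_system.h"

class TemperatureAwareFileSystem : public ROCKSDB_NAMESPACE::FileSystemWrapper {
 public:
  using FileSystemWrapper::FileSystemWrapper;

  ROCKSDB_NAMESPACE::IOStatus NewWritableFile(
      const std::string& fname, const ROCKSDB_NAMESPACE::FileOptions& opts,
      std::unique_ptr<ROCKSDB_NAMESPACE::FSWritableFile>* result,
      ROCKSDB_NAMESPACE::IODebugContext* dbg) override {
    if (opts.temperature != ROCKSDB_NAMESPACE::Temperature::kUnknown) {
      // Hypothetical policy: non-default-temperature (bottommost) SST files
      // go to a slower, cheaper device; MapToColdPath is a made-up helper.
      return FileSystemWrapper::NewWritableFile(MapToColdPath(fname), opts,
                                                result, dbg);
    }
    return FileSystemWrapper::NewWritableFile(fname, opts, result, dbg);
  }

 private:
  static std::string MapToColdPath(const std::string& fname) {
    return "/mnt/cold" + fname;  // hypothetical mount point
  }
};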
if (!get_time_status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -1611,6 +1929,7 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->output_path_id(), 0); meta.oldest_ancester_time = oldest_ancester_time; meta.file_creation_time = current_time; + meta.temperature = temperature; sub_compact->outputs.emplace_back( std::move(meta), cfd->internal_comparator(), /*enable_order_check=*/ @@ -1621,32 +1940,29 @@ Status CompactionJob::OpenCompactionOutputFile( writable_file->SetIOPriority(Env::IOPriority::IO_LOW); writable_file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = db_options_.checksum_handoff_file_types; writable_file->SetPreallocationBlockSize(static_cast( sub_compact->compaction->OutputFilePreallocationSize())); const auto& listeners = - sub_compact->compaction->immutable_cf_options()->listeners; + sub_compact->compaction->immutable_options()->listeners; sub_compact->outfile.reset(new WritableFileWriter( - std::move(writable_file), fname, file_options_, env_, io_tracer_, - db_options_.statistics.get(), listeners, - db_options_.file_checksum_gen_factory.get())); - - // If the Column family flag is to only optimize filters for hits, - // we can skip creating filters if this is the bottommost_level where - // data is going to be found - bool skip_filters = - cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; + std::move(writable_file), fname, file_options_, db_options_.clock, + io_tracer_, db_options_.stats, listeners, + db_options_.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile))); - sub_compact->builder.reset(NewTableBuilder( + TableBuilderOptions tboptions( *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(), sub_compact->compaction->output_compression(), - 0 /*sample_for_compression */, - sub_compact->compaction->output_compression_opts(), - sub_compact->compaction->output_level(), skip_filters, - oldest_ancester_time, 0 /* oldest_key_time */, - sub_compact->compaction->max_output_file_size(), current_time, db_id_, - db_session_id_)); + sub_compact->compaction->output_compression_opts(), cfd->GetID(), + cfd->GetName(), sub_compact->compaction->output_level(), + bottommost_level_, TableFileCreationReason::kCompaction, + oldest_ancester_time, 0 /* oldest_key_time */, current_time, db_id_, + db_session_id_, sub_compact->compaction->max_output_file_size(), + file_number); + sub_compact->builder.reset( + NewTableBuilder(tboptions, sub_compact->outfile.get())); LogFlush(db_options_.info_log); return s; } @@ -1689,6 +2005,8 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { #endif // !ROCKSDB_LITE void CompactionJob::UpdateCompactionStats() { + assert(compact_); + Compaction* compaction = compact_->compaction; compaction_stats_.num_input_files_in_non_output_levels = 0; compaction_stats_.num_input_files_in_output_level = 0; @@ -1706,27 +2024,20 @@ void CompactionJob::UpdateCompactionStats() { } } - uint64_t num_output_records = 0; - - for (const auto& sub_compact : compact_->sub_compact_states) { - size_t num_output_files = sub_compact.outputs.size(); - if (sub_compact.builder != nullptr) { - // An error occurred so ignore the last output. 
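Related usage note: the WritableFileWriter constructed a few lines above is now told whether kTableFile is in db_options_.checksum_handoff_file_types, which enables checksum handoff for compaction outputs. A small configuration sketch, not part of this patch; the Add() call is assumed to be the set's insertion method, so verify it against the FileTypeSet definition.

#include "rocksdb/options.h"
#include "rocksdb/types.h"

ROCKSDB_NAMESPACE::Options MakeChecksumHandoffOptions() {
  ROCKSDB_NAMESPACE::Options options;
  // Ask RocksDB to compute a checksum for every write to table files and hand
  // it to the FileSystem, which can verify the buffer before persisting it.
  options.checksum_handoff_file_types.Add(
      ROCKSDB_NAMESPACE::FileType::kTableFile);
  return options;
}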
- assert(num_output_files > 0); - --num_output_files; - } - compaction_stats_.num_output_files += static_cast(num_output_files); + assert(compaction_job_stats_); + compaction_stats_.bytes_read_blob = + compaction_job_stats_->total_blob_bytes_read; - num_output_records += sub_compact.num_output_records; + compaction_stats_.num_output_files = + static_cast(compact_->num_output_files); + compaction_stats_.num_output_files_blob = + static_cast(compact_->num_blob_output_files); + compaction_stats_.bytes_written = compact_->total_bytes; + compaction_stats_.bytes_written_blob = compact_->total_blob_bytes; - for (const auto& out : sub_compact.outputs) { - compaction_stats_.bytes_written += out.meta.fd.file_size; - } - } - - if (compaction_stats_.num_input_records > num_output_records) { + if (compaction_stats_.num_input_records > compact_->num_output_records) { compaction_stats_.num_dropped_records = - compaction_stats_.num_input_records - num_output_records; + compaction_stats_.num_input_records - compact_->num_output_records; } } @@ -1762,10 +2073,12 @@ void CompactionJob::UpdateCompactionJobStats( // output information compaction_job_stats_->total_output_bytes = stats.bytes_written; + compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; compaction_job_stats_->num_output_records = compact_->num_output_records; compaction_job_stats_->num_output_files = stats.num_output_files; + compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; - if (compact_->NumOutputFiles() > 0U) { + if (stats.num_output_files > 0) { CopyPrefix(compact_->SmallestUserKey(), CompactionJobStats::kMaxPrefixLength, &compaction_job_stats_->smallest_output_key_prefix); @@ -1852,7 +2165,7 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { uint64_t fileno = f[i]->fd.GetNumber(); uint32_t pathid = f[i]->fd.GetPathId(); files_in_one_level.files.push_back( - TableFileName(c->immutable_cf_options()->cf_paths, fileno, pathid)); + TableFileName(c->immutable_options()->cf_paths, fileno, pathid)); } param.input_files.push_back(files_in_one_level); } @@ -1868,6 +2181,7 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { compact_->sub_compact_states[i].compaction_job_stats.cpu_micros; } + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, compaction_stats_.cpu_micros); @@ -1895,7 +2209,7 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { // Generate a path name where an externally compacted file can // be copied into. Do not read into block cache. 
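Observability note: with the RecordTimeToHistogram() call added above, the RunRemote() path now populates COMPACTION_TIME alongside COMPACTION_CPU_TIME, just like the local path. A minimal sketch of reading those histograms back, not part of this patch; histogramData() is the existing Statistics API, and the recorded values are microseconds, matching compaction_stats_.micros.

#include <cstdio>
#include <memory>

#include "rocksdb/statistics.h"

void DumpCompactionTimings(
    const std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>& stats) {
  ROCKSDB_NAMESPACE::HistogramData wall;
  ROCKSDB_NAMESPACE::HistogramData cpu;
  stats->histogramData(ROCKSDB_NAMESPACE::COMPACTION_TIME, &wall);
  stats->histogramData(ROCKSDB_NAMESPACE::COMPACTION_CPU_TIME, &cpu);
  std::fprintf(stderr, "compaction wall p50=%.0fus p95=%.0fus cpu p50=%.0fus\n",
               wall.median, wall.percentile95, cpu.median);
}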
destinations.push_back( - TableFileName(sub->compaction->immutable_cf_options()->cf_paths, + TableFileName(sub->compaction->immutable_options()->cf_paths, file_numbers.back(), sub->compaction->output_path_id())); ROCKS_LOG_INFO(db_options_.info_log, "Going to install file %s to %s", @@ -1948,7 +2262,7 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { auto fn = - TableFileName(state.compaction->immutable_cf_options()->cf_paths, + TableFileName(state.compaction->immutable_options()->cf_paths, output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); tp[fn] = output.table_properties; } @@ -1978,7 +2292,7 @@ void CompactionJob::RetrieveResultsAndCleanup( for (const auto& sub_compact : compact_->sub_compact_states) { for (const auto& out : sub_compact.outputs) { std::string path = TableFileName( - sub_compact.compaction->immutable_cf_options()->cf_paths, + sub_compact.compaction->immutable_options()->cf_paths, out.meta.fd.GetNumber(), out.meta.fd.GetPathId()); OutputFile file; @@ -2002,4 +2316,625 @@ void CompactionJob::RetrieveResultsAndCleanup( CleanupCompaction(); } +std::string CompactionJob::GetTableFileName(uint64_t file_number) { + return TableFileName(compact_->compaction->immutable_options()->cf_paths, + file_number, compact_->compaction->output_path_id()); +} + +#ifndef ROCKSDB_LITE +std::string CompactionServiceCompactionJob::GetTableFileName( + uint64_t file_number) { + return MakeTableFileName(output_path_, file_number); +} + +CompactionServiceCompactionJob::CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, + LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + const std::string& output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result) + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, 0, log_buffer, nullptr, output_directory, + nullptr, stats, db_mutex, db_error_handler, existing_snapshots, + kMaxSequenceNumber, nullptr, table_cache, event_logger, + compaction->mutable_cf_options()->paranoid_file_checks, + compaction->mutable_cf_options()->report_bg_io_stats, dbname, + &(compaction_service_result->stats), Env::Priority::USER, io_tracer, + nullptr, nullptr, db_id, db_session_id, + compaction->column_family_data()->GetFullHistoryTsLow()), + output_path_(output_path), + compaction_input_(compaction_service_input), + compaction_result_(compaction_service_result) {} + +Status CompactionServiceCompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + + auto* c = compact_->compaction; + assert(c->column_family_data() != nullptr); + assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = + c->column_family_data()->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + Slice begin = 
compaction_input_.begin; + Slice end = compaction_input_.end; + compact_->sub_compact_states.emplace_back( + c, compaction_input_.has_begin ? &begin : nullptr, + compaction_input_.has_end ? &end : nullptr, + compaction_input_.approx_size); + + log_buffer_->FlushBufferToLog(); + LogCompaction(); + const uint64_t start_micros = db_options_.clock->NowMicros(); + // Pick the only sub-compaction we should have + assert(compact_->sub_compact_states.size() == 1); + SubcompactionState* sub_compact = compact_->sub_compact_states.data(); + + ProcessKeyValueCompaction(sub_compact); + + compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; + compaction_stats_.cpu_micros = sub_compact->compaction_job_stats.cpu_micros; + + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.cpu_micros); + + Status status = sub_compact->status; + IOStatus io_s = sub_compact->io_status; + + if (io_status_.ok()) { + io_status_ = io_s; + } + + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->Fsync(IOOptions(), dbg); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } + if (status.ok()) { + // TODO: Add verify_table() and VerifyCompactionFileConsistency() + } + + // Finish up all book-keeping to unify the subcompaction results + AggregateStatistics(); + UpdateCompactionStats(); + + compaction_result_->bytes_written = IOSTATS(bytes_written); + compaction_result_->bytes_read = IOSTATS(bytes_read); + RecordCompactionIOStats(); + + LogFlush(db_options_.info_log); + compact_->status = status; + compact_->status.PermitUncheckedError(); + + // Build compaction result + compaction_result_->output_level = compact_->compaction->output_level(); + compaction_result_->output_path = output_path_; + for (const auto& output_file : sub_compact->outputs) { + auto& meta = output_file.meta; + compaction_result_->output_files.emplace_back( + MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.smallest.Encode().ToString(), + meta.largest.Encode().ToString(), meta.oldest_ancester_time, + meta.file_creation_time, output_file.validator.GetHash(), + meta.marked_for_compaction); + } + compaction_result_->num_output_records = sub_compact->num_output_records; + compaction_result_->total_bytes = sub_compact->total_bytes; + + return status; +} + +void CompactionServiceCompactionJob::CleanupCompaction() { + CompactionJob::CleanupCompaction(); +} + +// Internal binary format for the input and result data +enum BinaryFormatVersion : uint32_t { + kOptionsString = 1, // Use string format similar to Option string format +}; + +// offset_of is used to get the offset of a class data member +// ex: offset_of(&ColumnFamilyDescriptor::options) +// This call will return the offset of options in ColumnFamilyDescriptor class +// +// This is the same as offsetof() but allow us to work with non standard-layout +// classes and structures +// refs: +// http://en.cppreference.com/w/cpp/concept/StandardLayoutType +// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 +static ColumnFamilyDescriptor dummy_cfd("", ColumnFamilyOptions()); +template +int offset_of(T1 ColumnFamilyDescriptor::*member) { + return int(size_t(&(dummy_cfd.*member)) - size_t(&dummy_cfd)); +} + +static CompactionServiceInput dummy_cs_input; +template +int offset_of(T1 CompactionServiceInput::*member) { + return 
int(size_t(&(dummy_cs_input.*member)) - size_t(&dummy_cs_input)); +} + +static std::unordered_map cfd_type_info = { + {"name", + {offset_of(&ColumnFamilyDescriptor::name), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"options", + {offset_of(&ColumnFamilyDescriptor::options), OptionType::kConfigurable, + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto cf_options = static_cast(addr); + return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(), + value, cf_options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto cf_options = static_cast(addr); + std::string result; + auto status = + GetStringFromColumnFamilyOptions(opts, *cf_options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = CFOptionsAsConfigurable(*this_one); + auto that_conf = CFOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." + mismatch_opt; + } + return result; + }}}, +}; + +static std::unordered_map cs_input_type_info = { + {"column_family", + OptionTypeInfo::Struct("column_family", &cfd_type_info, + offset_of(&CompactionServiceInput::column_family), + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, + {"db_options", + {offset_of(&CompactionServiceInput::db_options), OptionType::kConfigurable, + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto options = static_cast(addr); + return GetDBOptionsFromString(opts, DBOptions(), value, options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto options = static_cast(addr); + std::string result; + auto status = GetStringFromDBOptions(opts, *options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = DBOptionsAsConfigurable(*this_one); + auto that_conf = DBOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." 
+ mismatch_opt; + } + return result; + }}}, + {"snapshots", OptionTypeInfo::Vector( + offset_of(&CompactionServiceInput::snapshots), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, + {"input_files", OptionTypeInfo::Vector( + offset_of(&CompactionServiceInput::input_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kEncodedString})}, + {"output_level", + {offset_of(&CompactionServiceInput::output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"has_begin", + {offset_of(&CompactionServiceInput::has_begin), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"begin", + {offset_of(&CompactionServiceInput::begin), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"has_end", + {offset_of(&CompactionServiceInput::has_end), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"end", + {offset_of(&CompactionServiceInput::end), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"approx_size", + {offset_of(&CompactionServiceInput::approx_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + cs_output_file_type_info = { + {"file_name", + {offsetof(struct CompactionServiceOutputFile, file_name), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_seqno", + {offsetof(struct CompactionServiceOutputFile, smallest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_seqno", + {offsetof(struct CompactionServiceOutputFile, largest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_internal_key", + {offsetof(struct CompactionServiceOutputFile, smallest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_internal_key", + {offsetof(struct CompactionServiceOutputFile, largest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"oldest_ancester_time", + {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_creation_time", + {offsetof(struct CompactionServiceOutputFile, file_creation_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_hash", + {offsetof(struct CompactionServiceOutputFile, paranoid_hash), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"marked_for_compaction", + {offsetof(struct CompactionServiceOutputFile, marked_for_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + compaction_job_stats_type_info = { + {"elapsed_micros", + {offsetof(struct CompactionJobStats, elapsed_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cpu_micros", + {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_input_records", + {offsetof(struct CompactionJobStats, num_input_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_blobs_read", + 
{offsetof(struct CompactionJobStats, num_blobs_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files", + {offsetof(struct CompactionJobStats, num_input_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_at_output_level", + {offsetof(struct CompactionJobStats, num_input_files_at_output_level), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionJobStats, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files", + {offsetof(struct CompactionJobStats, num_output_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files_blob", + {offsetof(struct CompactionJobStats, num_output_files_blob), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_full_compaction", + {offsetof(struct CompactionJobStats, is_full_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_manual_compaction", + {offsetof(struct CompactionJobStats, is_manual_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_bytes", + {offsetof(struct CompactionJobStats, total_input_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_blob_bytes_read", + {offsetof(struct CompactionJobStats, total_blob_bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes", + {offsetof(struct CompactionJobStats, total_output_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes_blob", + {offsetof(struct CompactionJobStats, total_output_bytes_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_records_replaced", + {offsetof(struct CompactionJobStats, num_records_replaced), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_key_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_key_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_value_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_value_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_deletion_records", + {offsetof(struct CompactionJobStats, num_input_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_expired_deletion_records", + {offsetof(struct CompactionJobStats, num_expired_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_corrupt_keys", + {offsetof(struct CompactionJobStats, num_corrupt_keys), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_write_nanos", + {offsetof(struct CompactionJobStats, file_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_range_sync_nanos", + {offsetof(struct CompactionJobStats, file_range_sync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_fsync_nanos", + {offsetof(struct CompactionJobStats, file_fsync_nanos), + 
OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_prepare_write_nanos", + {offsetof(struct CompactionJobStats, file_prepare_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_output_key_prefix", + {offsetof(struct CompactionJobStats, smallest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_output_key_prefix", + {offsetof(struct CompactionJobStats, largest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_fallthru", + {offsetof(struct CompactionJobStats, num_single_del_fallthru), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_mismatch", + {offsetof(struct CompactionJobStats, num_single_del_mismatch), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +namespace { +// this is a helper struct to serialize and deserialize class Status, because +// Status's members are not public. +struct StatusSerializationAdapter { + uint8_t code; + uint8_t subcode; + uint8_t severity; + std::string message; + + StatusSerializationAdapter() {} + explicit StatusSerializationAdapter(const Status& s) { + code = s.code(); + subcode = s.subcode(); + severity = s.severity(); + auto msg = s.getState(); + message = msg ? msg : ""; + } + + Status GetStatus() { + return Status(static_cast(code), + static_cast(subcode), + static_cast(severity), message); + } +}; +} // namespace + +static std::unordered_map + status_adapter_type_info = { + {"code", + {offsetof(struct StatusSerializationAdapter, code), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"subcode", + {offsetof(struct StatusSerializationAdapter, subcode), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"severity", + {offsetof(struct StatusSerializationAdapter, severity), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"message", + {offsetof(struct StatusSerializationAdapter, message), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map cs_result_type_info = { + {"status", + {offsetof(struct CompactionServiceResult, status), + OptionType::kCustomizable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto status_obj = static_cast(addr); + StatusSerializationAdapter adapter; + Status s = OptionTypeInfo::ParseType( + opts, value, status_adapter_type_info, &adapter); + *status_obj = adapter.GetStatus(); + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto status_obj = static_cast(addr); + StatusSerializationAdapter adapter(*status_obj); + std::string result; + Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info, + &adapter, &result); + *value = "{" + result + "}"; + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr1, const void* addr2, std::string* mismatch) { + const auto status1 = static_cast(addr1); + const auto status2 = static_cast(addr2); + StatusSerializationAdapter adatper1(*status1); + StatusSerializationAdapter adapter2(*status2); + return 
OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info, + &adatper1, &adapter2, mismatch); + }}}, + {"output_files", + OptionTypeInfo::Vector( + offsetof(struct CompactionServiceResult, output_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone))}, + {"output_level", + {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"output_path", + {offsetof(struct CompactionServiceResult, output_path), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionServiceResult, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_bytes", + {offsetof(struct CompactionServiceResult, total_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_read", + {offsetof(struct CompactionServiceResult, bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_written", + {offsetof(struct CompactionServiceResult, bytes_written), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"stats", OptionTypeInfo::Struct( + "stats", &compaction_job_stats_type_info, + offsetof(struct CompactionServiceResult, stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +}; + +Status CompactionServiceInput::Read(const std::string& data_str, + CompactionServiceInput* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceInput string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Input data version not supported: " + + ToString(format_version)); + } +} + +Status CompactionServiceInput::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output); +} + +Status CompactionServiceResult::Read(const std::string& data_str, + CompactionServiceResult* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceResult string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Result data version not supported: " + + ToString(format_version)); + } +} + +Status CompactionServiceResult::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return 
OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output); +} + +#ifndef NDEBUG +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other, + mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other, + mismatch); +} +#endif // NDEBUG +#endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 18e9d5e8bd2..581692d4df0 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -17,6 +17,7 @@ #include #include +#include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" #include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" @@ -51,6 +52,7 @@ class Arena; class ErrorHandler; class MemTable; class SnapshotChecker; +class SystemClock; class TableCache; class Version; class VersionEdit; @@ -65,12 +67,13 @@ class CompactionJob { public: CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, VersionSet* versions, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, FSDirectory* db_directory, FSDirectory* output_directory, - Statistics* stats, InstrumentedMutex* db_mutex, - ErrorHandler* db_error_handler, + FSDirectory* blob_output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, @@ -79,9 +82,12 @@ class CompactionJob { const std::string& dbname, CompactionJobStats* compaction_job_stats, Env::Priority thread_pri, const std::shared_ptr& io_tracer, const std::atomic* manual_compaction_paused = nullptr, - const std::string& db_id = "", const std::string& db_session_id = ""); + const std::atomic* manual_compaction_canceled = nullptr, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* blob_callback = nullptr); - ~CompactionJob(); + virtual ~CompactionJob(); // no copy/move CompactionJob(CompactionJob&& job) = delete; @@ -110,11 +116,36 @@ class CompactionJob { // Return the IO status IOStatus io_status() const { return io_status_; } - private: + protected: struct SubcompactionState; + // CompactionJob state + struct CompactionState; void AggregateStatistics(); + void UpdateCompactionStats(); + void LogCompaction(); + void RecordCompactionIOStats(); + void CleanupCompaction(); + + // Call compaction filter. 
Then iterate through input and compact the + // kv-pairs + void ProcessKeyValueCompaction(SubcompactionState* sub_compact); + CompactionState* compact_; + InternalStats::CompactionStats compaction_stats_; + const ImmutableDBOptions& db_options_; + const MutableDBOptions mutable_db_options_copy_; + LogBuffer* log_buffer_; + FSDirectory* output_directory_; + Statistics* stats_; + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + + Env::WriteLifeTimeHint write_hint_; + + IOStatus io_status_; + + private: // Generates a histogram representing potential divisions of key ranges from // the input. It adds the starting and/or ending keys of certain input files // to the working set and then finds the approximate size of data in between @@ -122,12 +153,12 @@ class CompactionJob { // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); + void ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact); + // update the thread status for starting a compaction. void ReportStartedCompaction(Compaction* compaction); void AllocateCompactionOutputFileNumbers(); - // Call compaction filter. Then iterate through input and compact the - // kv-pairs - void ProcessKeyValueCompaction(SubcompactionState* sub_compact); Status FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, @@ -135,33 +166,23 @@ class CompactionJob { CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key = nullptr); Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); - void RecordCompactionIOStats(); Status OpenCompactionOutputFile(SubcompactionState* sub_compact); - void CleanupCompaction(); void UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const; void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats = nullptr); - void UpdateCompactionStats(); void UpdateCompactionInputStatsHelper( int* num_files, uint64_t* bytes_read, int input_level); - void LogCompaction(); - int job_id_; - // CompactionJob state - struct CompactionState; - CompactionState* compact_; CompactionJobStats* compaction_job_stats_; - InternalStats::CompactionStats compaction_stats_; // DBImpl state const std::string& dbname_; const std::string db_id_; const std::string db_session_id_; - const ImmutableDBOptions& db_options_; const FileOptions file_options_; Env* env_; @@ -172,11 +193,10 @@ class CompactionJob { VersionSet* versions_; const std::atomic* shutting_down_; const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_canceled_; const SequenceNumber preserve_deletes_seqnum_; - LogBuffer* log_buffer_; FSDirectory* db_directory_; - FSDirectory* output_directory_; - Statistics* stats_; + FSDirectory* blob_output_directory_; InstrumentedMutex* db_mutex_; ErrorHandler* db_error_handler_; // If there were two snapshots with seq numbers s1 and @@ -196,17 +216,153 @@ class CompactionJob { EventLogger* event_logger_; - // Is this compaction creating a file in the bottom most level? 
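Design note on the header reshuffle here: these members move from private to protected, the destructor becomes virtual, and GetTableFileName() (declared a bit further down) becomes a virtual hook, so a subclass can reuse the whole key-value processing machinery while only changing where output tables land, which is exactly what CompactionServiceCompactionJob does later in this header. A purely hypothetical variant as a sketch, not part of this patch:

class ScratchDirCompactionJob : public CompactionJob {
 public:
  using CompactionJob::CompactionJob;  // reuse the (long) base constructor

 private:
  std::string GetTableFileName(uint64_t file_number) override {
    // MakeTableFileName() is the existing helper that
    // CompactionServiceCompactionJob also uses; the directory is made up.
    return MakeTableFileName("/tmp/compaction_scratch", file_number);
  }
};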
- bool bottommost_level_; bool paranoid_file_checks_; bool measure_io_stats_; // Stores the Slices that designate the boundaries for each subcompaction std::vector boundaries_; // Stores the approx size of keys covered in the range of each subcompaction std::vector sizes_; - Env::WriteLifeTimeHint write_hint_; Env::Priority thread_pri_; - IOStatus io_status_; + std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; + + // Get the table file name in the location it is outputting to, which should also be in + // `output_directory_`. + virtual std::string GetTableFileName(uint64_t file_number); +}; + +// CompactionServiceInput is used to pass compaction information between two +// db instances. It contains the information needed to do a compaction. It +// doesn't contain the LSM tree information, which is passed through the MANIFEST +// file. +struct CompactionServiceInput { + ColumnFamilyDescriptor column_family; + + DBOptions db_options; + + std::vector snapshots; + + // SST files for compaction; it should already be expanded to include all the + // files needed for this compaction, for both input level files and output + // level files. + std::vector input_files; + int output_level; + + // information for subcompaction + bool has_begin = false; + std::string begin; + bool has_end = false; + std::string end; + uint64_t approx_size = 0; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceInput* obj); + Status Write(std::string* output); + + // Initialize a dummy ColumnFamilyDescriptor + CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {} + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceInput* other); + bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceOutputFile is the metadata for the output SST file +struct CompactionServiceOutputFile { + std::string file_name; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; + std::string smallest_internal_key; + std::string largest_internal_key; + uint64_t oldest_ancester_time; + uint64_t file_creation_time; + uint64_t paranoid_hash; + bool marked_for_compaction; + + CompactionServiceOutputFile() = default; + CompactionServiceOutputFile( + const std::string& name, SequenceNumber smallest, SequenceNumber largest, + std::string _smallest_internal_key, std::string _largest_internal_key, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + uint64_t _paranoid_hash, bool _marked_for_compaction) + : file_name(name), + smallest_seqno(smallest), + largest_seqno(largest), + smallest_internal_key(std::move(_smallest_internal_key)), + largest_internal_key(std::move(_largest_internal_key)), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time), + paranoid_hash(_paranoid_hash), + marked_for_compaction(_marked_for_compaction) {} +}; + +// CompactionServiceResult contains the compaction result from a different db +// instance; with this information, the primary db instance with write +// permission is able to install the result into the DB.
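Before moving on to the result type, note that the Read()/Write() pair declared on CompactionServiceInput above defines a small, versioned, options-string-based wire format, so the two db instances never need to agree on in-memory layout. A minimal round trip using only interfaces declared in this patch (field values are arbitrary, and the struct lives in this internal header rather than the public API):

#include <cassert>
#include <string>

#include "db/compaction/compaction_job.h"

void CompactionServiceInputRoundTrip() {
  ROCKSDB_NAMESPACE::CompactionServiceInput input;
  input.output_level = 3;
  input.has_begin = true;
  input.begin = "key_prefix_a";
  input.input_files = {"000012.sst", "000013.sst"};

  std::string wire;  // 4-byte format version followed by an options string
  ROCKSDB_NAMESPACE::Status s = input.Write(&wire);
  assert(s.ok());

  ROCKSDB_NAMESPACE::CompactionServiceInput decoded;
  s = ROCKSDB_NAMESPACE::CompactionServiceInput::Read(wire, &decoded);
  assert(s.ok());
#ifndef NDEBUG
  assert(decoded.TEST_Equals(&input));  // field-by-field check, debug builds only
#endif
}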
+struct CompactionServiceResult { + Status status; + std::vector output_files; + int output_level; + + // location of the output files + std::string output_path; + + // some statistics about the compaction + uint64_t num_output_records; + uint64_t total_bytes; + uint64_t bytes_read; + uint64_t bytes_written; + CompactionJobStats stats; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceResult* obj); + Status Write(std::string* output); + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceResult* other); + bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceCompactionJob is a read-only compaction job; it takes +// input information from `compaction_service_input` and puts result information +// in `compaction_service_result`; the SST files are generated in `output_path`. +class CompactionServiceCompactionJob : private CompactionJob { + public: + CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, LogBuffer* log_buffer, + FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + const std::string& output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result); + + // Run the compaction in the current thread and return the result + Status Run(); + + void CleanupCompaction(); + + IOStatus io_status() const { return CompactionJob::io_status(); } + + private: + // Get table file name in output_path + std::string GetTableFileName(uint64_t file_number) override; + // Specifies the compaction output path; otherwise it uses the default DB path + const std::string output_path_; + + // Compaction job input + const CompactionServiceInput& compaction_input_; + + // Compaction job result + CompactionServiceResult* compaction_result_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index 9c640a591e0..325cc247e29 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -24,7 +24,6 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" -#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" @@ -298,15 +297,14 @@ class CompactionJobStatsTest : public testing::Test, return result; } - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) { + Status Size(uint64_t* size, const Slice& start, const Slice& limit, + int cf = 0) { Range r(start, limit); - uint64_t size; if (cf == 0) { - db_->GetApproximateSizes(&r, 1, &size); + return db_->GetApproximateSizes(&r, 1, size); } else { - db_->GetApproximateSizes(handles_[1], &r, 1, &size); + return db_->GetApproximateSizes(handles_[1], &r, 1, size); } - return size; } void Compact(int cf, const Slice& start, const Slice& limit, diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index
210042ca05e..7437f1249ff 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -5,6 +5,8 @@ #ifndef ROCKSDB_LITE +#include "db/compaction/compaction_job.h" + #include #include #include @@ -14,13 +16,13 @@ #include "db/blob/blob_index.h" #include "db/column_family.h" -#include "db/compaction/compaction_job.h" #include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/version_set.h" #include "file/writable_file_writer.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" @@ -67,31 +69,38 @@ void VerifyInitializationOfCompactionJobStats( } // namespace -// TODO(icanadi) Make it simpler once we mock out VersionSet -class CompactionJobTest : public testing::Test { - public: - CompactionJobTest() +class CompactionJobTestBase : public testing::Test { + protected: + CompactionJobTestBase(std::string dbname, const Comparator* ucmp, + std::function encode_u64_ts) : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("compaction_job_test")), + fs_(env_->GetFileSystem()), + dbname_(std::move(dbname)), + ucmp_(ucmp), db_options_(), mutable_cf_options_(cf_options_), mutable_db_options_(), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), - versions_(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)), + versions_(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_, + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")), shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), - error_handler_(nullptr, db_options_, &mutex_) { + error_handler_(nullptr, db_options_, &mutex_), + encode_u64_ts_(std::move(encode_u64_ts)) {} + + void SetUp() override { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.env = env_; db_options_.fs = fs_; db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); + cf_options_.comparator = ucmp_; + cf_options_.table_factory = mock_table_factory_; } std::string GenerateFileName(uint64_t file_number) { @@ -102,9 +111,10 @@ class CompactionJobTest : public testing::Test { return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); } - static std::string KeyStr(const std::string& user_key, - const SequenceNumber seq_num, const ValueType t) { - return InternalKey(user_key, seq_num, t).Encode().ToString(); + std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num, + const ValueType t, uint64_t ts = 0) { + std::string user_key_with_ts = user_key + encode_u64_ts_(ts); + return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString(); } static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, @@ -144,7 +154,8 @@ class CompactionJobTest : public testing::Test { std::string skey; std::string value; std::tie(skey, value) = kv; - const Status pikStatus = ParseInternalKey(skey, &key); + const Status pik_status = + ParseInternalKey(skey, &key, true /* log_err_key */); smallest_seqno = std::min(smallest_seqno, key.sequence); largest_seqno = std::max(largest_seqno, key.sequence); @@ -162,7 +173,7 @@ class CompactionJobTest : public testing::Test { first_key = false; - if (pikStatus.ok() && 
key.type == kTypeBlobIndex) { + if (pik_status.ok() && key.type == kTypeBlobIndex) { BlobIndex blob_index; const Status s = blob_index.DecodeFrom(value); if (!s.ok()) { @@ -207,9 +218,9 @@ class CompactionJobTest : public testing::Test { // returns expected result after compaction mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) { stl_wrappers::KVMap expected_results; - const int kKeysPerFile = 10000; - const int kCorruptKeysPerFile = 200; - const int kMatchingKeys = kKeysPerFile / 2; + constexpr int kKeysPerFile = 10000; + constexpr int kCorruptKeysPerFile = 200; + constexpr int kMatchingKeys = kKeysPerFile / 2; SequenceNumber sequence_number = 0; auto corrupt_id = [&](int id) { @@ -238,7 +249,7 @@ class CompactionJobTest : public testing::Test { {bottommost_internal_key.Encode().ToString(), value}); } } - mock::SortKVVector(&contents); + mock::SortKVVector(&contents, ucmp_); AddMockFile(contents); } @@ -254,33 +265,29 @@ class CompactionJobTest : public testing::Test { } void NewDB() { - DestroyDB(dbname_, Options()); + EXPECT_OK(DestroyDB(dbname_, Options())); EXPECT_OK(env_->CreateDirIfMissing(dbname_)); versions_.reset( new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); compaction_job_stats_.Reset(); - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); - std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); - new_db.SetDBId(db_id); - } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + std::unique_ptr file_writer; + const auto& fs = env_->GetFileSystem(); + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); + ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; @@ -293,13 +300,12 @@ class CompactionJobTest : public testing::Test { ASSERT_OK(s); - std::vector column_families; - cf_options_.table_factory = mock_table_factory_; cf_options_.merge_operator = merge_op_; cf_options_.compaction_filter = compaction_filter_.get(); + std::vector column_families; column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); - EXPECT_OK(versions_->Recover(column_families, false)); + ASSERT_OK(versions_->Recover(column_families, false)); cfd_ = versions_->GetColumnFamilySet()->GetDefault(); } @@ -337,19 +343,23 @@ class CompactionJobTest : public testing::Test { EventLogger event_logger(db_options_.info_log.get()); // TODO(yiwu) add a mock snapshot checker and add test for it. 
SnapshotChecker* snapshot_checker = nullptr; + ASSERT_TRUE(full_history_ts_low_.empty() || + ucmp_->timestamp_size() == full_history_ts_low_.size()); CompactionJob compaction_job( - 0, &compaction, db_options_, env_options_, versions_.get(), - &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr, - nullptr, nullptr, &mutex_, &error_handler_, snapshots, + 0, &compaction, db_options_, mutable_db_options_, env_options_, + versions_.get(), &shutting_down_, preserve_deletes_seqnum_, &log_buffer, + nullptr, nullptr, nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER, nullptr /* IOTracer */); + Env::Priority::USER, nullptr /* IOTracer */, + /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); mutex_.Unlock(); - Status s; - s = compaction_job.Run(); + Status s = compaction_job.Run(); ASSERT_OK(s); ASSERT_OK(compaction_job.io_status()); mutex_.Lock(); @@ -379,6 +389,7 @@ class CompactionJobTest : public testing::Test { Env* env_; std::shared_ptr fs_; std::string dbname_; + const Comparator* const ucmp_; EnvOptions env_options_; ImmutableDBOptions db_options_; ColumnFamilyOptions cf_options_; @@ -397,6 +408,17 @@ class CompactionJobTest : public testing::Test { std::unique_ptr compaction_filter_; std::shared_ptr merge_op_; ErrorHandler error_handler_; + std::string full_history_ts_low_; + const std::function encode_u64_ts_; +}; + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest : public CompactionJobTestBase { + public: + CompactionJobTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_test"), + BytewiseComparator(), + [](uint64_t /*ts*/) { return ""; }) {} }; TEST_F(CompactionJobTest, Simple) { @@ -1077,6 +1099,297 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) { /* expected_oldest_blob_file_number */ 19); } +TEST_F(CompactionJobTest, InputSerialization) { + // Setup a random CompactionServiceInput + CompactionServiceInput input; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + input.column_family.options.comparator = ReverseBytewiseComparator(); + input.column_family.options.max_bytes_for_level_base = + rnd64.Uniform(UINT64_MAX); + input.column_family.options.disable_auto_compactions = rnd.OneIn(2); + input.column_family.options.compression = kZSTD; + input.column_family.options.compression_opts.level = 4; + input.db_options.max_background_flushes = 10; + input.db_options.paranoid_checks = rnd.OneIn(2); + input.db_options.statistics = CreateDBStatistics(); + input.db_options.env = env_; + while (!rnd.OneIn(10)) { + input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX)); + } + while (!rnd.OneIn(10)) { + input.input_files.emplace_back(rnd.RandomString(rnd.Uniform(kStrMaxLen))); + } + input.output_level = 4; + input.has_begin = rnd.OneIn(2); + if (input.has_begin) { + input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.has_end = rnd.OneIn(2); + if (input.has_end) { + input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.approx_size = rnd64.Uniform(UINT64_MAX); + + std::string output; + ASSERT_OK(input.Write(&output)); + + // Test 
deserialization + CompactionServiceInput deserialized1; + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&input)); + + // Test mismatch + deserialized1.db_options.max_background_flushes += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "db_options.max_background_flushes"); + + // Test unknown field + CompactionServiceInput deserialized2; + output.clear(); + ASSERT_OK(input.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&input)); + + // Test missing field + CompactionServiceInput deserialized3; + deserialized3.output_level = 0; + std::string to_remove = "output_level=4;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "output_level"); + + // manually set the value back, should match the original structure + deserialized3.output_level = 4; + ASSERT_TRUE(deserialized3.TEST_Equals(&input)); + + // Test invalid version + output.clear(); + ASSERT_OK(input.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceInput::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_F(CompactionJobTest, ResultSerialization) { + // Setup a random CompactionServiceResult + CompactionServiceResult result; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + std::vector status_list = { + Status::OK(), + Status::InvalidArgument("invalid option"), + Status::Aborted("failed to run"), + Status::NotSupported("not supported option"), + }; + result.status = + status_list.at(rnd.Uniform(static_cast(status_list.size()))); + while (!rnd.OneIn(10)) { + result.output_files.emplace_back( + rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), rnd.OneIn(2)); + } + result.output_level = rnd.Uniform(10); + result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + result.num_output_records = rnd64.Uniform(UINT64_MAX); + result.total_bytes = rnd64.Uniform(UINT64_MAX); + result.bytes_read = 123; + result.bytes_written = rnd64.Uniform(UINT64_MAX); + result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX); + result.stats.num_output_files = rnd.Uniform(1000); + result.stats.is_full_compaction = rnd.OneIn(2); + result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX); + result.stats.num_input_files = 9; + + std::string output; + ASSERT_OK(result.Write(&output)); + + // Test deserialization + CompactionServiceResult deserialized1; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&result)); + + // Test mismatch + 
deserialized1.stats.num_input_files += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "stats.num_input_files"); + + // Test unknown field + CompactionServiceResult deserialized2; + output.clear(); + ASSERT_OK(result.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&result)); + + // Test missing field + CompactionServiceResult deserialized3; + deserialized3.bytes_read = 0; + std::string to_remove = "bytes_read=123;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "bytes_read"); + + deserialized3.bytes_read = 123; + ASSERT_TRUE(deserialized3.TEST_Equals(&result)); + + // Test invalid version + output.clear(); + ASSERT_OK(result.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceResult::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); + for (const auto& item : status_list) { + item.PermitUncheckedError(); + } +} + +class CompactionJobTimestampTest : public CompactionJobTestBase { + public: + CompactionJobTimestampTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"), + test::ComparatorWithU64Ts(), test::EncodeInt) {} +}; + +TEST_F(CompactionJobTimestampTest, GCDisabled) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 7, ValueType::kTypeDeletionWithTimestamp, 97), ""}, + {KeyStr("c", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeValue, 95), "c5"}}); + AddMockFile(file2); + + SetLastSequence(10); + + auto expected_results = mock::MakeMockFile( + {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}, + {KeyStr("b", 7, ValueType::kTypeDeletionWithTimestamp, 97), ""}, + {KeyStr("c", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeValue, 95), "c5"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, NoKeyExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + AddMockFile(file2); + + SetLastSequence(101); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("a", 4, 
ValueType::kTypeValue, 98), "a4"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, AllKeysExpired) { + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""}, + {KeyStr("b", 6, ValueType::kTypeValue, 99), "b6"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("b", 3, ValueType::kTypeDeletionWithTimestamp, 97), ""}, + {KeyStr("b", 2, ValueType::kTypeValue, 96), "b2"}}); + AddMockFile(file2); + + SetLastSequence(6); + + auto expected_results = + mock::MakeMockFile({{KeyStr("b", 0, ValueType::kTypeValue, 0), "b6"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(std::numeric_limits::max()); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, SomeKeysExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"}, + {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"}, + {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}}); + AddMockFile(file2); + + SetLastSequence(6); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(49); + RunCompaction({files}, expected_results); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 523418c3d86..6d109213420 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -139,18 +139,16 @@ CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options, if (!enable_compression) { return cf_options.compression_opts; } - // If bottommost_compression is set and we are compacting to the - // bottommost level then we should use the specified compression options - // for the bottmomost_compression. - if (cf_options.bottommost_compression != kDisableCompressionOption && - level >= (vstorage->num_non_empty_levels() - 1) && + // If bottommost_compression_opts is enabled and we are compacting to the + // bottommost level then we should use the specified compression options. 
+ if (level >= (vstorage->num_non_empty_levels() - 1) && cf_options.bottommost_compression_opts.enabled) { return cf_options.bottommost_compression_opts; } return cf_options.compression_opts; } -CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, +CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : ioptions_(ioptions), icmp_(icmp) {} @@ -532,7 +530,7 @@ bool CompactionPicker::SetupOtherInputs( } } if (expand_inputs) { - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n", @@ -672,17 +670,41 @@ Compaction* CompactionPicker::CompactRange( // two files overlap. if (input_level > 0) { const uint64_t limit = mutable_cf_options.max_compaction_bytes; - uint64_t total = 0; + uint64_t input_level_total = 0; + int hint_index = -1; + InternalKey* smallest = nullptr; + InternalKey* largest = nullptr; for (size_t i = 0; i + 1 < inputs.size(); ++i) { + if (!smallest) { + smallest = &inputs[i]->smallest; + } + largest = &inputs[i]->largest; + uint64_t s = inputs[i]->compensated_file_size; - total += s; - if (total >= limit) { + uint64_t output_level_total = 0; + if (output_level < vstorage->num_non_empty_levels()) { + std::vector files; + vstorage->GetOverlappingInputsRangeBinarySearch( + output_level, smallest, largest, &files, hint_index, &hint_index); + for (const auto& file : files) { + output_level_total += file->compensated_file_size; + } + } + + input_level_total += s; + + if (input_level_total + output_level_total >= limit) { covering_the_whole_range = false; + // still include the current file, so the compaction could be larger + // than max_compaction_bytes, which is also to make sure the compaction + // can make progress even `max_compaction_bytes` is small (e.g. smaller + // than an SST file). inputs.files.resize(i + 1); break; } } } + assert(compact_range_options.target_path_id < static_cast(ioptions_.cf_paths.size())); @@ -1006,6 +1028,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( // any currently-existing files. 
for (auto file_num : *input_files) { bool found = false; + int input_file_level = -1; for (const auto& level_meta : cf_meta.levels) { for (const auto& file_meta : level_meta.files) { if (file_num == TableFileNameToNumber(file_meta.name)) { @@ -1015,6 +1038,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( " is already being compacted."); } found = true; + input_file_level = level_meta.level; break; } } @@ -1027,6 +1051,13 @@ Status CompactionPicker::SanitizeCompactionInputFiles( "Specified compaction input file " + MakeTableFileName("", file_num) + " does not exist in column family " + cf_meta.name + "."); } + if (input_file_level > output_level) { + return Status::InvalidArgument( + "Cannot compact file to up level, input file: " + + MakeTableFileName("", file_num) + " level " + + ToString(input_file_level) + " > output level " + + ToString(output_level)); + } } return Status::OK(); @@ -1045,6 +1076,8 @@ void CompactionPicker::RegisterCompaction(Compaction* c) { level0_compactions_in_progress_.insert(c); } compactions_in_progress_.insert(c); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered", + c); } void CompactionPicker::UnregisterCompaction(Compaction* c) { diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h index 4bd431d712a..70de11f94e8 100644 --- a/db/compaction/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -46,7 +46,7 @@ struct CompactionInputFiles; // compaction style specific logic for them. class CompactionPicker { public: - CompactionPicker(const ImmutableCFOptions& ioptions, + CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp); virtual ~CompactionPicker(); @@ -218,7 +218,7 @@ class CompactionPicker { } protected: - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; // A helper function to SanitizeCompactionInputFiles() that // sanitizes "input_files" by adding necessary files. @@ -244,7 +244,7 @@ class CompactionPicker { // compaction. class NullCompactionPicker : public CompactionPicker { public: - NullCompactionPicker(const ImmutableCFOptions& ioptions, + NullCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual ~NullCompactionPicker() {} diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index 59213aec9d3..4b4c09b80f0 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -45,7 +45,7 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( uint64_t total_size = GetTotalFilesSize(level_files); int64_t _current_time; - auto status = ioptions_.env->GetCurrentTime(&_current_time); + auto status = ioptions_.clock->GetCurrentTime(&_current_time); if (!status.ok()) { ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: Couldn't get current time: %s. 
" @@ -244,7 +244,7 @@ Compaction* FIFOCompactionPicker::CompactRange( assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); Compaction* c = PickCompaction(cf_name, mutable_cf_options, mutable_db_options, vstorage, &log_buffer); log_buffer.FlushBufferToLog(); diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h index e8aec64c6ab..2a07f8df776 100644 --- a/db/compaction/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -15,7 +15,7 @@ namespace ROCKSDB_NAMESPACE { class FIFOCompactionPicker : public CompactionPicker { public: - FIFOCompactionPicker(const ImmutableCFOptions& ioptions, + FIFOCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index 3778d6d8dec..08c48c8f0b5 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -49,7 +49,7 @@ class LevelCompactionBuilder { CompactionPicker* compaction_picker, LogBuffer* log_buffer, const MutableCFOptions& mutable_cf_options, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableDBOptions& mutable_db_options) : cf_name_(cf_name), vstorage_(vstorage), @@ -121,7 +121,7 @@ class LevelCompactionBuilder { CompactionReason compaction_reason_ = CompactionReason::kUnknown; const MutableCFOptions& mutable_cf_options_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const MutableDBOptions& mutable_db_options_; // Pick a path ID to place a newly generated file, with its level static uint32_t GetPathId(const ImmutableCFOptions& ioptions, diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h index 460a5ef1705..42a9b60a632 100644 --- a/db/compaction/compaction_picker_level.h +++ b/db/compaction/compaction_picker_level.h @@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE { // for description of Leveled compaction. 
class LevelCompactionPicker : public CompactionPicker { public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, + LevelCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index d5580cc8ad7..5d543048f5a 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -12,7 +12,6 @@ #include "db/compaction/compaction_picker_level.h" #include "db/compaction/compaction_picker_universal.h" -#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -31,7 +30,7 @@ class CompactionPickerTest : public testing::Test { const Comparator* ucmp_; InternalKeyComparator icmp_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; MutableDBOptions mutable_db_options_; LevelCompactionPicker level_compaction_picker; @@ -142,7 +141,7 @@ class CompactionPickerTest : public testing::Test { if (temp_vstorage_) { VersionBuilder builder(FileOptions(), &ioptions_, nullptr, vstorage_.get(), nullptr); - builder.SaveTo(temp_vstorage_.get()); + ASSERT_OK(builder.SaveTo(temp_vstorage_.get())); vstorage_ = std::move(temp_vstorage_); } vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); @@ -651,7 +650,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) { TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) { // The case where universal periodic compaction couldn't form - // a compaction that inlcudes any file marked for periodic compaction. + // a compaction that includes any file marked for periodic compaction. // Right now we form the compaction anyway if it is more than one // sorted run. Just put the case here to validate that it doesn't // crash. @@ -801,7 +800,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { Add(2, 6U, "150", "175", 60000000U); // Overlaps with file 26, 27, total size 521M Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size - // 520M, the smalelst overlapping + // 520M, the smallest overlapping Add(2, 8U, "201", "300", 60000000U); // Overlaps with file 28, 29, total size 521M @@ -1229,7 +1228,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { Add(0, 32U, "001", "400", 1000000000U, 0, 0); Add(0, 33U, "001", "400", 1000000000U, 0, 0); - // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. Add(1, 4U, "050", "300", 1000000000U, 0, 0); file_map_[4u].first->being_compacted = true; Add(1, 5U, "301", "350", 1000000000U, 0, 0); @@ -1262,7 +1261,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) { Add(0, 32U, "001", "400", 1000000000U, 0, 0); Add(0, 33U, "001", "400", 1000000000U, 0, 0); - // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. 
Add(1, 4U, "050", "300", 1000000000U, 0, 0); Add(1, 5U, "301", "350", 1000000000U, 0, 0); diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 1e95191d66b..b6f38f8282f 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -32,7 +32,7 @@ namespace { class UniversalCompactionBuilder { public: UniversalCompactionBuilder( - const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp, + const ImmutableOptions& ioptions, const InternalKeyComparator* icmp, const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, UniversalCompactionPicker* picker, LogBuffer* log_buffer) @@ -108,7 +108,7 @@ class UniversalCompactionBuilder { // overlapping. bool IsInputFilesNonOverlapping(Compaction* c); - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const InternalKeyComparator* icmp_; double score_; std::vector sorted_runs_; @@ -486,7 +486,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { } #endif // update statistics - RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, + RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, c->inputs(0)->size()); picker_->RegisterCompaction(c); @@ -733,7 +733,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( } // Look at overall size amplification. If size amplification -// exceeeds the configured value, then do a compaction +// exceeds the configured value, then do a compaction // of the candidate files all the way upto the earliest // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h index a06ad3b8562..5f897cc9b39 100644 --- a/db/compaction/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -15,7 +15,7 @@ namespace ROCKSDB_NAMESPACE { class UniversalCompactionPicker : public CompactionPicker { public: - UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + UniversalCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc new file mode 100644 index 00000000000..1b9afab8910 --- /dev/null +++ b/db/compaction/compaction_service_test.cc @@ -0,0 +1,458 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class MyTestCompactionService : public CompactionService { + public: + MyTestCompactionService(const std::string& db_path, + std::shared_ptr fs, Options& options) + : db_path_(db_path), fs_(fs), options_(options) {} + + static const char* kClassName() { return "MyTestCompactionService"; } + + const char* Name() const override { return kClassName(); } + + CompactionServiceJobStatus Start(const std::string& compaction_service_input, + int job_id) override { + InstrumentedMutexLock l(&mutex_); + jobs_.emplace(job_id, compaction_service_input); + CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess; + TEST_SYNC_POINT_CALLBACK("MyTestCompactionService::Start::End", &s); + return s; + } + + CompactionServiceJobStatus WaitForComplete( + int job_id, std::string* compaction_service_result) override { + std::string compaction_input; + { + InstrumentedMutexLock l(&mutex_); + auto i = jobs_.find(job_id); + if (i == jobs_.end()) { + return CompactionServiceJobStatus::kFailure; + } + compaction_input = std::move(i->second); + jobs_.erase(i); + } + + CompactionServiceOptionsOverride options_override; + options_override.env = options_.env; + options_override.file_checksum_gen_factory = + options_.file_checksum_gen_factory; + options_override.comparator = options_.comparator; + options_override.merge_operator = options_.merge_operator; + options_override.compaction_filter = options_.compaction_filter; + options_override.compaction_filter_factory = + options_.compaction_filter_factory; + options_override.prefix_extractor = options_.prefix_extractor; + options_override.table_factory = options_.table_factory; + options_override.sst_partitioner_factory = options_.sst_partitioner_factory; + + Status s = DB::OpenAndCompact( + db_path_, db_path_ + "/" + ROCKSDB_NAMESPACE::ToString(job_id), + compaction_input, compaction_service_result, options_override); + TEST_SYNC_POINT_CALLBACK("MyTestCompactionService::WaitForComplete::End", + compaction_service_result); + compaction_num_.fetch_add(1); + if (s.ok()) { + return CompactionServiceJobStatus::kSuccess; + } else { + return CompactionServiceJobStatus::kFailure; + } + } + + int GetCompactionNum() { return compaction_num_.load(); } + + private: + InstrumentedMutex mutex_; + std::atomic_int compaction_num_{0}; + std::map jobs_; + const std::string db_path_; + std::shared_ptr fs_; + Options options_; +}; + +class CompactionServiceTest : public DBTestBase { + public: + explicit CompactionServiceTest() + : DBTestBase("compaction_service_test", true) {} + + protected: + void GenerateTestData() { + // Generate 20 files @ L2 + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + // Generate 10 files @ L1 overlap with all 20 files @ L2 + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + ASSERT_EQ(FilesPerLevel(), "0,10,20"); + } + + void VerifyTestData() { + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + } +}; + +TEST_F(CompactionServiceTest, BasicCompactions) { + Options options = 
CurrentOptions(); + options.env = env_; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + + DestroyAndReopen(options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = + dynamic_cast(options.compaction_service.get()); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + + // Test failed compaction + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) { + // override job status + Status* s = static_cast(status); + *s = Status::Aborted("MyTestCompactionService failed to compact!"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s; + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + s = Put(Key(key_id), "value_new" + ToString(key_id)); + if (s.IsAborted()) { + break; + } + } + if (s.IsAborted()) { + break; + } + s = Flush(); + if (s.IsAborted()) { + break; + } + s = dbfull()->TEST_WaitForCompact(); + if (s.IsAborted()) { + break; + } + } + ASSERT_TRUE(s.IsAborted()); +} + +TEST_F(CompactionServiceTest, ManualCompaction) { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + DestroyAndReopen(options); + GenerateTestData(); + + auto my_cs = + dynamic_cast(options.compaction_service.get()); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + start_str = Key(120); + start = start_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + end_str = Key(92); + end = end_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); +} + +TEST_F(CompactionServiceTest, FailedToStart) { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + DestroyAndReopen(options); + GenerateTestData(); + + SyncPoint::GetInstance()->SetCallBack( + "MyTestCompactionService::Start::End", [&](void* status) { + // override job status + auto s = static_cast(status); + *s = CompactionServiceJobStatus::kFailure; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::string start_str = 
Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_TRUE(s.IsIncomplete()); +} + +TEST_F(CompactionServiceTest, InvalidResult) { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + DestroyAndReopen(options); + GenerateTestData(); + + SyncPoint::GetInstance()->SetCallBack( + "MyTestCompactionService::WaitForComplete::End", [&](void* result) { + // override job status + auto result_str = static_cast(result); + *result_str = "Invalid Str"; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_FALSE(s.ok()); +} + +// TODO: support sub-compaction +TEST_F(CompactionServiceTest, DISABLED_SubCompaction) { + Options options = CurrentOptions(); + options.env = env_; + options.max_subcompactions = 10; + options.target_file_size_base = 1 << 10; // 1KB + options.disable_auto_compactions = true; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + + DestroyAndReopen(options); + GenerateTestData(); + + auto cro = CompactRangeOptions(); + cro.max_subcompactions = 10; + db_->CompactRange(cro, nullptr, nullptr); +} + +class PartialDeleteCompactionFilter : public CompactionFilter { + public: + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int i = std::stoi(key.ToString().substr(3)); + if (i > 5 && i <= 105) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + const char* Name() const override { return "PartialDeleteCompactionFilter"; } +}; + +TEST_F(CompactionServiceTest, CompactionFilter) { + Options options = CurrentOptions(); + options.env = env_; + auto delete_comp_filter = PartialDeleteCompactionFilter(); + options.compaction_filter = &delete_comp_filter; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + + DestroyAndReopen(options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i > 5 && i <= 105) { + ASSERT_EQ(result, "NOT_FOUND"); + } else if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = + dynamic_cast(options.compaction_service.get()); + ASSERT_GE(my_cs->GetCompactionNum(), 1); +} + +TEST_F(CompactionServiceTest, Snapshot) { + Options options = CurrentOptions(); + options.env = env_; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(1), 
"value1")); + ASSERT_OK(Put(Key(2), "value1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(1), "value2")); + ASSERT_OK(Put(Key(3), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + auto my_cs = + dynamic_cast(options.compaction_service.get()); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + ASSERT_EQ("value1", Get(Key(1), s1)); + ASSERT_EQ("value2", Get(Key(1))); + db_->ReleaseSnapshot(s1); +} + +TEST_F(CompactionServiceTest, ConcurrentCompaction) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 100; + options.env = env_; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + options.max_background_jobs = 20; + + DestroyAndReopen(options); + GenerateTestData(); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + std::vector threads; + for (const auto& file : meta.levels[1].files) { + threads.push_back(std::thread([&]() { + std::string fname = file.db_path + "/" + file.name; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2)); + })); + } + + for (auto& thread : threads) { + thread.join(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = + dynamic_cast(options.compaction_service.get()); + ASSERT_EQ(my_cs->GetCompactionNum(), 10); + ASSERT_EQ(FilesPerLevel(), "0,0,10"); +} + +} // namespace ROCKSDB_NAMESPACE + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index 4eac91e2a57..876cf07fae7 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -17,7 +17,6 @@ #include "util/string_util.h" #include "utilities/merge_operators.h" -using std::unique_ptr; namespace ROCKSDB_NAMESPACE { namespace { diff --git a/db/convenience.cc b/db/convenience.cc index 96735d7e5f5..5af6515c8c4 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -44,7 +44,7 @@ Status VerifySstFileChecksum(const Options& options, std::unique_ptr file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); Status s = ioptions.fs->NewRandomAccessFile(file_path, FileOptions(env_options), diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 964ff11700c..cabf7e700e5 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -9,7 +9,6 @@ #ifndef ROCKSDB_LITE -#include #include #include #include @@ -20,8 +19,8 @@ #include "db/db_test_util.h" #include "db/log_format.h" #include "db/version_set.h" -#include "env/composite_env_wrapper.h" #include "file/filename.h" +#include "port/stack_trace.h" #include 
"rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" @@ -33,6 +32,7 @@ #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" #include "util/random.h" #include "util/string_util.h" @@ -42,7 +42,8 @@ static constexpr int kValueSize = 1000; class CorruptionTest : public testing::Test { public: - test::ErrorEnv env_; + std::shared_ptr env_guard_; + test::ErrorEnv* env_; std::string dbname_; std::shared_ptr tiny_cache_; Options options_; @@ -53,9 +54,14 @@ class CorruptionTest : public testing::Test { // set it to 0), test SequenceNumberRecovery will fail, likely because of a // bug in recovery code. Keep it 4 for now to make the test passes. tiny_cache_ = NewLRUCache(100, 4); + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(base_env, nullptr); + env_ = new test::ErrorEnv(base_env); options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; - options_.env = &env_; - dbname_ = test::PerThreadDBPath("corruption_test"); + options_.env = env_; + dbname_ = test::PerThreadDBPath(env_, "corruption_test"); Status s = DestroyDB(dbname_, options_); EXPECT_OK(s); @@ -77,8 +83,11 @@ class CorruptionTest : public testing::Test { if (getenv("KEEP_DB")) { fprintf(stdout, "db is still at %s\n", dbname_.c_str()); } else { - EXPECT_OK(DestroyDB(dbname_, Options())); + Options opts; + opts.env = env_->target(); + EXPECT_OK(DestroyDB(dbname_, opts)); } + delete env_; } void CloseDb() { @@ -93,7 +102,7 @@ class CorruptionTest : public testing::Test { if (opt.env == Options().env) { // If env is not overridden, replace it with ErrorEnv. // Otherwise, the test already uses a non-default Env. - opt.env = &env_; + opt.env = env_; } opt.arena_block_size = 4096; BlockBasedTableOptions table_options; @@ -124,7 +133,7 @@ class CorruptionTest : public testing::Test { //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); Slice key = Key(i + start, &key_space); batch.Clear(); - ASSERT_OK(batch.Put(key, Value(i, &value_space))); + ASSERT_OK(batch.Put(key, Value(i + start, &value_space))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); } } @@ -176,7 +185,7 @@ class CorruptionTest : public testing::Test { void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { // Pick file to corrupt std::vector filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); uint64_t number; FileType type; std::string fname; @@ -191,7 +200,7 @@ class CorruptionTest : public testing::Test { } ASSERT_TRUE(!fname.empty()) << filetype; - test::CorruptFile(fname, offset, bytes_to_corrupt); + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); } // corrupts exactly one file at level `level`. if no file found at level, @@ -201,7 +210,8 @@ class CorruptionTest : public testing::Test { db_->GetLiveFilesMetaData(&metadata); for (const auto& m : metadata) { if (m.level == level) { - test::CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt); + ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset, + bytes_to_corrupt)); return; } } @@ -256,8 +266,8 @@ TEST_F(CorruptionTest, Recovery) { // is not available for WAL though. 
CloseDb(); #endif - Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Corrupt(kWalFile, 19, 1); // WriteBatch tag for first record + Corrupt(kWalFile, log::kBlockSize + 1000, 1); // Somewhere in second block ASSERT_TRUE(!TryReopen().ok()); options_.paranoid_checks = false; Reopen(&options_); @@ -267,14 +277,14 @@ TEST_F(CorruptionTest, Recovery) { } TEST_F(CorruptionTest, RecoverWriteError) { - env_.writable_file_error_ = true; + env_->writable_file_error_ = true; Status s = TryReopen(); ASSERT_TRUE(!s.ok()); } TEST_F(CorruptionTest, NewFileErrorDuringWrite) { // Do enough writing to force minor compaction - env_.writable_file_error_ = true; + env_->writable_file_error_ = true; const int num = static_cast(3 + (Options().write_buffer_size / kValueSize)); std::string value_storage; @@ -290,8 +300,8 @@ TEST_F(CorruptionTest, NewFileErrorDuringWrite) { ASSERT_TRUE(!failed || !s.ok()); } ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_.num_writable_file_errors_, 1); - env_.writable_file_error_ = false; + ASSERT_GE(env_->num_writable_file_errors_, 1); + env_->writable_file_error_ = false; Reopen(); } @@ -309,7 +319,7 @@ TEST_F(CorruptionTest, TableFile) { TEST_F(CorruptionTest, VerifyChecksumReadahead) { Options options; - SpecialEnv senv(Env::Default()); + SpecialEnv senv(env_->target()); options.env = &senv; // Disable block cache as we are going to check checksum for // the same file twice and measure number of reads. @@ -431,6 +441,7 @@ TEST_F(CorruptionTest, CorruptedDescriptor) { TEST_F(CorruptionTest, CompactionInputError) { Options options; + options.env = env_; Reopen(&options); Build(10); DBImpl* dbi = static_cast_with_check(db_); @@ -451,6 +462,7 @@ TEST_F(CorruptionTest, CompactionInputError) { TEST_F(CorruptionTest, CompactionInputErrorParanoid) { Options options; + options.env = env_; options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; @@ -521,28 +533,31 @@ TEST_F(CorruptionTest, RangeDeletionCorrupted) { ASSERT_EQ(static_cast(1), metadata.size()); std::string filename = dbname_ + metadata[0].name; - std::unique_ptr file; - ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions())); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - filename)); + FileOptions file_opts; + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, filename, file_opts, + &file_reader, nullptr)); uint64_t file_size; - ASSERT_OK(options_.env->GetFileSize(filename, &file_size)); + ASSERT_OK( + fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr)); BlockHandle range_del_handle; ASSERT_OK(FindMetaBlock( file_reader.get(), file_size, kBlockBasedTableMagicNumber, - ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle)); + ImmutableOptions(options_), kRangeDelBlock, &range_del_handle)); ASSERT_OK(TryReopen()); - test::CorruptFile(filename, static_cast(range_del_handle.offset()), 1); + ASSERT_OK(test::CorruptFile(env_, filename, + static_cast(range_del_handle.offset()), 1)); ASSERT_TRUE(TryReopen().IsCorruption()); } TEST_F(CorruptionTest, FileSystemStateCorrupted) { for (int iter = 0; iter < 2; ++iter) { Options options; + options.env = env_; options.paranoid_checks = true; options.create_if_missing = true; Reopen(&options); @@ -551,7 +566,7 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) { DBImpl* dbi = 
static_cast_with_check(db_); std::vector metadata; dbi->GetLiveFilesMetaData(&metadata); - ASSERT_GT(metadata.size(), size_t(0)); + ASSERT_GT(metadata.size(), 0); std::string filename = dbname_ + metadata[0].name; delete db_; @@ -559,15 +574,15 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) { if (iter == 0) { // corrupt file size std::unique_ptr file; - env_.NewWritableFile(filename, &file, EnvOptions()); + ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions())); ASSERT_OK(file->Append(Slice("corrupted sst"))); file.reset(); Status x = TryReopen(&options); ASSERT_TRUE(x.IsCorruption()); } else { // delete the file - ASSERT_OK(env_.DeleteFile(filename)); + ASSERT_OK(env_->DeleteFile(filename)); Status x = TryReopen(&options); - ASSERT_TRUE(x.IsPathNotFound()); + ASSERT_TRUE(x.IsCorruption()); } ASSERT_OK(DestroyDB(dbname_, options_)); @@ -581,6 +596,7 @@ static const auto& corruption_modes = { TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { Options options; + options.env = env_; options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; @@ -595,7 +611,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { options.table_factory = mock; mock->SetCorruptionMode(mode); ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); + assert(db_ != nullptr); // suppress false clang-analyze report Build(10); s = db_->Flush(FlushOptions()); if (mode == mock::MockTableFactory::kCorruptNone) { @@ -608,6 +624,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { Options options; + options.env = env_; options.paranoid_file_checks = true; options.create_if_missing = true; options.check_flush_compaction_key_order = false; @@ -620,7 +637,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { std::make_shared(); options.table_factory = mock; ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); + assert(db_ != nullptr); // suppress false clang-analyze report Build(100, 2); // ASSERT_OK(db_->Flush(FlushOptions())); DBImpl* dbi = static_cast_with_check(db_); @@ -635,8 +652,111 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { } } +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + std::string start, end; + assert(db_ != nullptr); // suppress false clang-analyze report + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + Build(10); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + 
options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(10, 0, 0); + std::string start, end; + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(5, &start), Key(15, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(12, &start), Key(17, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(4, &end))); + Build(10, 10, 0); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + std::string start, end; + Build(10); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(6, &start), Key(8, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); + } +} + TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { Options options; + options.env = env_; options.create_if_missing = true; options.allow_data_in_errors = true; auto mode = mock::MockTableFactory::kCorruptKey; @@ -650,7 +770,7 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { options.table_factory = mock; ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); + assert(db_ != nullptr); // suppress false clang-analyze report Build(100, 2); DBImpl* dbi = static_cast_with_check(db_); @@ -662,6 +782,7 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { TEST_F(CorruptionTest, CompactionKeyOrderCheck) { Options options; + options.env = env_; options.paranoid_file_checks = false; options.create_if_missing = true; options.check_flush_compaction_key_order = false; @@ -672,7 +793,7 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) { std::make_shared(); options.table_factory = mock; ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); + assert(db_ != nullptr); // suppress false clang-analyze report mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey); Build(100, 2); DBImpl* dbi = static_cast_with_check(db_); @@ -685,6 +806,7 @@ TEST_F(CorruptionTest, 
CompactionKeyOrderCheck) { TEST_F(CorruptionTest, FlushKeyOrderCheck) { Options options; + options.env = env_; options.paranoid_file_checks = false; options.create_if_missing = true; ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); @@ -713,7 +835,6 @@ TEST_F(CorruptionTest, FlushKeyOrderCheck) { } TEST_F(CorruptionTest, DisableKeyOrderCheck) { - Options options; ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}})); DBImpl* dbi = static_cast_with_check(db_); @@ -732,106 +853,10 @@ TEST_F(CorruptionTest, DisableKeyOrderCheck) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { - Options options; - options.paranoid_file_checks = true; - options.create_if_missing = true; - for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, options)); - ASSERT_OK(DB::Open(options, dbname_, &db_)); - std::string start, end; - assert(db_ != nullptr); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(3, &start), Key(7, &end))); - auto snap = db_->GetSnapshot(); - ASSERT_NE(snap, nullptr); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(8, &start), Key(9, &end))); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(2, &start), Key(5, &end))); - Build(10); - if (do_flush) { - ASSERT_OK(db_->Flush(FlushOptions())); - } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); - } - db_->ReleaseSnapshot(snap); - } -} - -TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { - Options options; - options.paranoid_file_checks = true; - options.create_if_missing = true; - for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, options)); - ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); - Build(10, 0, 0); - std::string start, end; - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(5, &start), Key(15, &end))); - auto snap = db_->GetSnapshot(); - ASSERT_NE(snap, nullptr); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(8, &start), Key(9, &end))); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(12, &start), Key(17, &end))); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(2, &start), Key(4, &end))); - Build(10, 10, 0); - if (do_flush) { - ASSERT_OK(db_->Flush(FlushOptions())); - } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); - } - db_->ReleaseSnapshot(snap); - } -} - -TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { - Options options; - options.paranoid_file_checks = true; - options.create_if_missing = true; - for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, options)); - ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); - std::string start, end; - Build(10); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(3, &start), Key(7, &end))); - auto snap = db_->GetSnapshot(); - ASSERT_NE(snap, nullptr); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(6, &start), Key(8, &end))); - 
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(2, &start), Key(5, &end))); - if (do_flush) { - ASSERT_OK(db_->Flush(FlushOptions())); - } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); - } - db_->ReleaseSnapshot(snap); - } -} - TEST_F(CorruptionTest, VerifyWholeTableChecksum) { CloseDb(); Options options; - options.env = &env_; + options.env = env_; ASSERT_OK(DestroyDB(dbname_, options)); options.create_if_missing = true; options.file_checksum_gen_factory = @@ -840,50 +865,43 @@ TEST_F(CorruptionTest, VerifyWholeTableChecksum) { Build(10, 5); - auto* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); CloseDb(); // Corrupt the first byte of each table file, this must be data block. Corrupt(kTableFile, 0, 1); ASSERT_OK(TryReopen(&options)); - dbi = static_cast_with_check(db_); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); int count{0}; SyncPoint::GetInstance()->SetCallBack( - "DBImpl::VerifySstFileChecksum:mismatch", [&](void* arg) { + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { auto* s = reinterpret_cast(arg); - assert(s); + ASSERT_NE(s, nullptr); ++count; ASSERT_NOK(*s); }); SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_TRUE(dbi->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); ASSERT_EQ(1, count); - - CloseDb(); - ASSERT_OK(DestroyDB(dbname_, options)); - Reopen(&options); - Build(10, 5); - dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->VerifyFileChecksums(ReadOptions())); - CloseDb(); - Corrupt(kTableFile, 0, 1); - - // Set best_efforts_recovery to true - options.best_efforts_recovery = true; -#ifdef OS_LINUX - ASSERT_TRUE(TryReopen(&options).IsCorruption()); -#endif // OS_LINUX } } // namespace ROCKSDB_NAMESPACE +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2aaf2c50ded..9b76c03d5ca 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -63,6 +63,15 @@ class CuckooTableDBTest : public testing::Test { ASSERT_OK(DB::Open(opts, dbname_, &db_)); } + void DestroyAndReopen(Options* options) { + assert(options); + ASSERT_OK(db_->Close()); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + Reopen(options); + } + Status Put(const Slice& k, const Slice& v) { return db_->Put(WriteOptions(), k, v); } @@ -120,10 +129,10 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key3", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(3U, ptc.begin()->second->num_entries); ASSERT_EQ("1", 
FilesPerLevel()); @@ -137,9 +146,9 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key5", "v5")); ASSERT_OK(Put("key6", "v6")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(2U, ptc.size()); auto row = ptc.begin(); ASSERT_EQ(3U, row->second->num_entries); @@ -155,8 +164,8 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Delete("key6")); ASSERT_OK(Delete("key5")); ASSERT_OK(Delete("key4")); - dbfull()->TEST_FlushMemTable(); - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(3U, ptc.size()); row = ptc.begin(); ASSERT_EQ(3U, row->second->num_entries); @@ -177,10 +186,10 @@ TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) { ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key1", "v3")); // Duplicate - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(2U, ptc.begin()->second->num_entries); ASSERT_EQ("1", FilesPerLevel()); @@ -205,12 +214,12 @@ static std::string Uint64Key(uint64_t i) { TEST_F(CuckooTableDBTest, Uint64Comparator) { Options options = CurrentOptions(); options.comparator = test::Uint64Comparator(); - Reopen(&options); + DestroyAndReopen(&options); ASSERT_OK(Put(Uint64Key(1), "v1")); ASSERT_OK(Put(Uint64Key(2), "v2")); ASSERT_OK(Put(Uint64Key(3), "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get(Uint64Key(1))); ASSERT_EQ("v2", Get(Uint64Key(2))); @@ -219,10 +228,10 @@ TEST_F(CuckooTableDBTest, Uint64Comparator) { // Add more keys. ASSERT_OK(Delete(Uint64Key(2))); // Delete. - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. 
ASSERT_OK(Put(Uint64Key(4), "v4")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get(Uint64Key(1))); ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2))); ASSERT_EQ("v0", Get(Uint64Key(3))); @@ -242,11 +251,11 @@ TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) { for (int idx = 0; idx < 28; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("1", FilesPerLevel()); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */)); ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 28; ++idx) { ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx))); @@ -265,15 +274,15 @@ TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { for (int idx = 0; idx < 11; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a'))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("1", FilesPerLevel()); // Generate one more file in level-0, and should trigger level-0 compaction for (int idx = 0; idx < 11; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_EQ("0,1", FilesPerLevel()); for (int idx = 0; idx < 11; ++idx) { @@ -294,7 +303,7 @@ TEST_F(CuckooTableDBTest, AdaptiveTable) { ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key3", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using plain table. std::shared_ptr block_based_factory( @@ -310,7 +319,7 @@ TEST_F(CuckooTableDBTest, AdaptiveTable) { Reopen(&options); ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key1", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using block based table. 
options.table_factory.reset(NewAdaptiveTableFactory( @@ -319,7 +328,7 @@ TEST_F(CuckooTableDBTest, AdaptiveTable) { Reopen(&options); ASSERT_OK(Put("key5", "v6")); ASSERT_OK(Put("key2", "v7")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v5", Get("key1")); ASSERT_EQ("v7", Get("key2")); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 21723c6a1be..34f27b80924 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -12,6 +12,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/perf_context.h" #include "rocksdb/utilities/debug.h" @@ -20,6 +21,7 @@ #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" #endif +#include "util/file_checksum_helper.h" #include "util/random.h" #include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" @@ -37,7 +39,10 @@ TEST_F(DBBasicTest, OpenWhenOpen) { options.env = env_; DB* db2 = nullptr; Status s = DB::Open(options, dbname_, &db2); - ASSERT_NOK(s); + ASSERT_NOK(s) << [db2]() { + delete db2; + return "db2 open: ok"; + }(); ASSERT_EQ(Status::Code::kIOError, s.code()); ASSERT_EQ(Status::SubCode::kNone, s.subcode()); ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr); @@ -142,7 +147,7 @@ TEST_F(DBBasicTest, ReadOnlyDB) { // Reopen and flush memtable. Reopen(options); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); @@ -178,7 +183,7 @@ TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) { // Reopen and flush memtable. Reopen(options); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); @@ -201,7 +206,7 @@ TEST_F(DBBasicTest, CompactedDB) { Reopen(options); // 1 L0 file, use CompactedDB if max_open_files = -1 ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); - Flush(); + ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); Status s = Put("new", "value"); @@ -219,12 +224,12 @@ TEST_F(DBBasicTest, CompactedDB) { Reopen(options); // Add more L0 files ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a'))); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); - Flush(); + ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); @@ -401,16 +406,18 @@ TEST_F(DBBasicTest, GetSnapshot) { TEST_F(DBBasicTest, CheckLock) { do { - DB* localdb; + DB* localdb = nullptr; Options options = CurrentOptions(); ASSERT_OK(TryReopen(options)); // second open should fail Status s = DB::Open(options, dbname_, &localdb); - ASSERT_NOK(s); + ASSERT_NOK(s) << [localdb]() { + delete localdb; + return "localdb open: ok"; + }(); #ifdef OS_LINUX - ASSERT_TRUE(s.ToString().find("lock hold by current process") != - std::string::npos); + ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos); #endif // OS_LINUX } while (ChangeCompactOptions()); } @@ -659,7 +666,7 @@ TEST_F(DBBasicTest, Snapshot) { ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); ASSERT_EQ(1U, GetNumSnapshots()); - ASSERT_LE(time_snap1, GetTimeOldestSnapshots()); + ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber()); db_->ReleaseSnapshot(s2); @@ -1407,10 +1414,10 @@ TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) 
{ // mix with memtable ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k5", "v5")); ASSERT_OK(Delete(1, "no_key")); @@ -1453,19 +1460,19 @@ TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) { // mix with memtable ASSERT_OK(Merge(1, "k1", "v1")); ASSERT_OK(Merge(1, "k2", "v2")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k3", "v3")); ASSERT_OK(Merge(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k4", "v4_2")); ASSERT_OK(Merge(1, "k6", "v6")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k7", "v7")); ASSERT_OK(Merge(1, "k8", "v8")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); get_perf_context()->Reset(); @@ -1505,12 +1512,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); @@ -1519,12 +1526,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1533,12 +1540,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); @@ -1584,12 +1591,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); @@ -1598,12 +1605,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1612,12 +1619,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); @@ -1699,7 +1706,7 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSize) { ASSERT_OK(Put(1, "k7", "v7_")); ASSERT_OK(Put(1, "k3", "v3_")); ASSERT_OK(Put(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k11", "v11")); ASSERT_OK(Delete(1, "no_key")); @@ -1709,7 +1716,7 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSize) { ASSERT_OK(Put(1, "k15", "v15")); ASSERT_OK(Put(1, "k16", "v16")); ASSERT_OK(Put(1, "k17", "v17")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k1", "v1_")); ASSERT_OK(Put(1, "k2", "v2_")); @@ -1779,12 +1786,12 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { 
ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); @@ -1793,12 +1800,12 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1807,12 +1814,12 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); @@ -1875,6 +1882,7 @@ TEST_F(DBBasicTest, MultiGetStats) { Options options; options.create_if_missing = true; options.disable_auto_compactions = true; + options.env = env_; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.block_size = 1; @@ -1884,7 +1892,7 @@ TEST_F(DBBasicTest, MultiGetStats) { table_options.no_block_cache = true; table_options.cache_index_and_filter_blocks = false; table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); int total_keys = 2000; @@ -1901,10 +1909,10 @@ TEST_F(DBBasicTest, MultiGetStats) { keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { - Flush(1); + ASSERT_OK(Flush(1)); } } - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); for (int i = 501; i < 1000; ++i) { @@ -1912,11 +1920,11 @@ TEST_F(DBBasicTest, MultiGetStats) { keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { - Flush(1); + ASSERT_OK(Flush(1)); } } - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); for (int i = 1001; i < total_keys; ++i) { @@ -1924,10 +1932,10 @@ TEST_F(DBBasicTest, MultiGetStats) { keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { - Flush(1); + ASSERT_OK(Flush(1)); } } - Flush(1); + ASSERT_OK(Flush(1)); Close(); ReopenWithColumnFamilies({"default", "pikachu"}, options); @@ -1954,7 +1962,7 @@ TEST_F(DBBasicTest, MultiGetStats) { ASSERT_GT(hist_sst.max, 0); // Minimun number of blocks read in a level. - ASSERT_EQ(hist_data_blocks.min, 0); + ASSERT_EQ(hist_data_blocks.min, 3); ASSERT_GT(hist_index_and_filter_blocks.min, 0); // Minimun number of sst files read in a level. 
ASSERT_GT(hist_sst.max, 0); @@ -2035,11 +2043,11 @@ TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) { ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k5", "v5")); const Snapshot* snap1 = dbfull()->GetSnapshot(); ASSERT_OK(Delete(1, "k4")); - Flush(1); + ASSERT_OK(Flush(1)); const Snapshot* snap2 = dbfull()->GetSnapshot(); get_perf_context()->Reset(); @@ -2168,7 +2176,7 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { table_options.block_size = 16 * 1024; ASSERT_TRUE(table_options.block_size > BlockBasedTable::kMultiGetReadStackBufSize); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); std::string zero_str(128, '\0'); @@ -2178,7 +2186,7 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { std::string value(rnd.RandomString(128) + zero_str); assert(Put(Key(i), value) == Status::OK()); } - Flush(); + ASSERT_OK(Flush()); std::vector key_data(10); std::vector keys; @@ -2201,8 +2209,6 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu", "eevee"}, options); size_t num_cfs = handles_.size(); @@ -2241,8 +2247,6 @@ TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); DestroyAndReopen(options); ASSERT_OK(Put("foo", "value")); ASSERT_OK(Flush()); @@ -2282,6 +2286,43 @@ class TableFileListener : public EventListener { }; } // namespace +TEST_F(DBBasicTest, LastSstFileNotInManifest) { + // If the last sst file is not tracked in MANIFEST, + // or the VersionEdit for the last sst file is not synced, + // on recovery, the last sst file should be deleted, + // and new sst files shouldn't reuse its file number. + Options options = CurrentOptions(); + DestroyAndReopen(options); + Close(); + + // Manually add a sst file. + constexpr uint64_t kSstFileNumber = 100; + const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); + ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", + /* fname = */ kSstFile, + /* should_sync = */ true)); + ASSERT_OK(env_->FileExists(kSstFile)); + + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + Reopen(options); + // kSstFile should already be deleted. + ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound()); + + ASSERT_OK(Put("k", "v")); + ASSERT_OK(Flush()); + // New sst file should have file number > kSstFileNumber. + std::vector& files = + listener->GetFiles(kDefaultColumnFamilyName); + ASSERT_EQ(files.size(), 1); + const std::string fname = files[0].erase(0, (dbname_ + "/").size()); + uint64_t number = 0; + FileType type = kTableFile; + ASSERT_TRUE(ParseFileName(fname, &number, &type)); + ASSERT_EQ(type, kTableFile); + ASSERT_GT(number, kSstFileNumber); +} + TEST_F(DBBasicTest, RecoverWithMissingFiles) { Options options = CurrentOptions(); DestroyAndReopen(options); @@ -2289,8 +2330,6 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) { // Disable auto compaction to simplify SST file name tracking. 
options.disable_auto_compactions = true; options.listeners.emplace_back(listener); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector all_cf_names = {kDefaultColumnFamilyName, "pikachu", "eevee"}; @@ -2351,8 +2390,6 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) { TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); options.env = env_; DestroyAndReopen(options); ASSERT_OK(Put("foo", "value0")); @@ -2379,8 +2416,6 @@ TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { TEST_F(DBBasicTest, RecoverWithNoCurrentFile) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); options.env = env_; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -2404,8 +2439,6 @@ TEST_F(DBBasicTest, RecoverWithNoCurrentFile) { TEST_F(DBBasicTest, RecoverWithNoManifest) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); options.env = env_; DestroyAndReopen(options); ASSERT_OK(Put("foo", "value")); @@ -2417,7 +2450,7 @@ TEST_F(DBBasicTest, RecoverWithNoManifest) { ASSERT_OK(env_->GetChildren(dbname_, &files)); for (const auto& file : files) { uint64_t number = 0; - FileType type = kLogFile; + FileType type = kWalFile; if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file)); } @@ -2435,8 +2468,6 @@ TEST_F(DBBasicTest, RecoverWithNoManifest) { TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); DestroyAndReopen(options); TableFileListener* listener = new TableFileListener(); options.listeners.emplace_back(listener); @@ -2475,6 +2506,42 @@ TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); } + +TEST_F(DBBasicTest, DisableTrackWal) { + // If WAL tracking was enabled, and then disabled during reopen, + // the previously tracked WALs should be removed from MANIFEST. + + Options options = CurrentOptions(); + options.track_and_verify_wals_in_manifest = true; + // extremely small write buffer size, + // so that new WALs are created more frequently. + options.write_buffer_size = 100; + options.env = env_; + DestroyAndReopen(options); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i))); + } + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->SyncWAL()); + // Some WALs are tracked. + ASSERT_FALSE(dbfull()->TEST_GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Disable WAL tracking. + options.track_and_verify_wals_in_manifest = false; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + // Previously tracked WALs are cleared. + ASSERT_TRUE(dbfull()->TEST_GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Re-enable WAL tracking again. 
+ options.track_and_verify_wals_in_manifest = true; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(dbfull()->TEST_GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); +} #endif // !ROCKSDB_LITE TEST_F(DBBasicTest, ManifestChecksumMismatch) { @@ -2504,6 +2571,64 @@ TEST_F(DBBasicTest, ManifestChecksumMismatch) { ASSERT_TRUE(s.IsCorruption()); } +#ifndef ROCKSDB_LITE +class DBBasicTestTrackWal : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestTrackWal() + : DBTestBase("/db_basic_test_track_wal", /*env_do_fsync=*/false) {} + + int CountWalFiles() { + VectorLogPtr log_files; + EXPECT_OK(dbfull()->GetSortedWalFiles(log_files)); + return static_cast(log_files.size()); + }; +}; + +TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) { + // If a WAL becomes obsolete after flushing, but is not deleted from disk yet, + // then if SyncWAL is called afterwards, the obsolete WAL should not be + // tracked in MANIFEST. + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.track_and_verify_wals_in_manifest = true; + options.atomic_flush = GetParam(); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf"}, options); + ASSERT_EQ(handles_.size(), 2); // default, cf + // Do not delete WALs. + ASSERT_OK(db_->DisableFileDeletions()); + constexpr int n = 10; + std::vector> wals(n); + for (size_t i = 0; i < n; i++) { + // Generate a new WAL for each key-value. + const int cf = i % 2; + ASSERT_OK(db_->GetCurrentWalFile(&wals[i])); + ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i))); + ASSERT_OK(Flush({0, 1})); + } + ASSERT_EQ(CountWalFiles(), n); + // Since all WALs are obsolete, no WAL should be tracked in MANIFEST. + ASSERT_OK(db_->SyncWAL()); + + // Manually delete all WALs. + Close(); + for (const auto& wal : wals) { + ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber()))); + } + + // If SyncWAL tracks the obsolete WALs in MANIFEST, + // reopen will fail because the WALs are missing from disk. 
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options)); + Destroy(options); +} + +INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal, + testing::Bool()); +#endif // ROCKSDB_LITE + class DBBasicTestMultiGet : public DBTestBase { public: DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache, @@ -2563,12 +2688,13 @@ class DBBasicTestMultiGet : public DBTestBase { table_options.block_cache_compressed = compressed_cache_; table_options.flush_block_policy_factory.reset( new MyFlushBlockPolicyFactory()); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); if (!compression_enabled_) { options.compression = kNoCompression; } else { options.compression_opts.parallel_threads = compression_parallel_threads; } + options_ = options; Reopen(options); if (num_cfs > 1) { @@ -2589,9 +2715,9 @@ class DBBasicTestMultiGet : public DBTestBase { : Put(cf, Key(i), values_[i])) == Status::OK()); } if (num_cfs == 1) { - Flush(); + EXPECT_OK(Flush()); } else { - dbfull()->Flush(FlushOptions(), handles_[cf]); + EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); } for (int i = 0; i < 100; ++i) { @@ -2603,9 +2729,9 @@ class DBBasicTestMultiGet : public DBTestBase { Status::OK()); } if (num_cfs == 1) { - Flush(); + EXPECT_OK(Flush()); } else { - dbfull()->Flush(FlushOptions(), handles_[cf]); + EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); } } } @@ -2638,6 +2764,7 @@ class DBBasicTestMultiGet : public DBTestBase { bool compression_enabled() { return compression_enabled_; } bool has_compressed_cache() { return compressed_cache_ != nullptr; } bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; } + Options get_options() { return options_; } static void SetUpTestCase() {} static void TearDownTestCase() {} @@ -2693,6 +2820,7 @@ class DBBasicTestMultiGet : public DBTestBase { const char* Name() const override { return "MyBlockCache"; } + using Cache::Insert; Status Insert(const Slice& key, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Handle** handle = nullptr, @@ -2701,6 +2829,7 @@ class DBBasicTestMultiGet : public DBTestBase { return target_->Insert(key, value, charge, deleter, handle, priority); } + using Cache::Lookup; Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { num_lookups_++; Handle* handle = target_->Lookup(key, stats); @@ -2723,6 +2852,7 @@ class DBBasicTestMultiGet : public DBTestBase { std::shared_ptr compressed_cache_; std::shared_ptr uncompressed_cache_; + Options options_; bool compression_enabled_; std::vector values_; std::vector uncompressable_values_; @@ -2865,6 +2995,123 @@ TEST_P(DBBasicTestWithParallelIO, MultiGet) { } } +#ifndef ROCKSDB_LITE +TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) { + class FakeDirectIOEnv : public EnvWrapper { + class FakeDirectIOSequentialFile; + class FakeDirectIORandomAccessFile; + + public: + FakeDirectIOEnv(Env* env) : EnvWrapper(env) {} + + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { + std::unique_ptr file; + assert(options.use_direct_reads); + EnvOptions opts = options; + opts.use_direct_reads = false; + Status s = target()->NewRandomAccessFile(fname, &file, opts); + if (!s.ok()) { + return s; + } + result->reset(new FakeDirectIORandomAccessFile(std::move(file))); + return s; + } + + private: + class FakeDirectIOSequentialFile : public 
SequentialFileWrapper { + public: + FakeDirectIOSequentialFile(std::unique_ptr&& file) + : SequentialFileWrapper(file.get()), file_(std::move(file)) {} + ~FakeDirectIOSequentialFile() {} + + bool use_direct_io() const override { return true; } + size_t GetRequiredBufferAlignment() const override { return 1; } + + private: + std::unique_ptr file_; + }; + + class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper { + public: + FakeDirectIORandomAccessFile(std::unique_ptr&& file) + : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {} + ~FakeDirectIORandomAccessFile() {} + + bool use_direct_io() const override { return true; } + size_t GetRequiredBufferAlignment() const override { return 1; } + + private: + std::unique_ptr file_; + }; + }; + + std::unique_ptr env(new FakeDirectIOEnv(env_)); + Options opts = get_options(); + opts.env = env.get(); + opts.use_direct_reads = true; + Reopen(opts); + + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + if (uncompressed_cache_) { + uncompressed_cache_->SetCapacity(0); + uncompressed_cache_->SetCapacity(1048576); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + bool read_from_cache = false; + if (fill_cache()) { + if (has_uncompressed_cache()) { + read_from_cache = true; + } else if (has_compressed_cache() && compression_enabled()) { + read_from_cache = true; + } + } + + int expected_reads = random_reads; + if (!compression_enabled() || !has_compressed_cache()) { + expected_reads += 2; + } else { + expected_reads += (read_from_cache ? 0 : 2); + } + if (env_->random_read_counter_.Read() != expected_reads) { + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + } + Close(); +} +#endif // ROCKSDB_LITE + TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { std::vector key_data(10); std::vector keys; @@ -2990,7 +3237,7 @@ class DeadlineFS : public FileSystemWrapper { // or to simply delay but return success anyway. 
The latter mimics the // behavior of PosixFileSystem, which does not enforce any timeout explicit DeadlineFS(SpecialEnv* env, bool error_on_delay) - : FileSystemWrapper(FileSystem::Default()), + : FileSystemWrapper(env->GetFileSystem()), deadline_(std::chrono::microseconds::zero()), io_timeout_(std::chrono::microseconds::zero()), env_(env), @@ -3029,7 +3276,9 @@ class DeadlineFS : public FileSystemWrapper { // Increment the IO counter and return a delay in microseconds IOStatus ShouldDelay(const IOOptions& opts) { - if (!deadline_.count() && !io_timeout_.count()) { + if (timedout_) { + return IOStatus::TimedOut(); + } else if (!deadline_.count() && !io_timeout_.count()) { return IOStatus::OK(); } if (!ignore_deadline_ && delay_trigger_ == io_count_++) { @@ -3165,7 +3414,7 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { std::shared_ptr cache = NewLRUCache(1048576); BlockBasedTableOptions table_options; table_options.block_cache = cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.env = env.get(); SetTimeElapseOnlySleepOnReopen(&options); ReopenWithColumnFamilies(GetCFNames(), options); @@ -3325,17 +3574,40 @@ TEST_F(DBBasicTest, VerifyFileChecksums) { DestroyAndReopen(options); ASSERT_OK(Put("a", "value")); ASSERT_OK(Flush()); - ASSERT_TRUE(dbfull()->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); Reopen(options); - ASSERT_OK(dbfull()->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); // Write an L0 with checksum computed. ASSERT_OK(Put("b", "value")); ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + + // Does the right thing but with the wrong name -- using it should lead to an + // error. + class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c { + public: + MisnamedFileChecksumGenerator(const FileChecksumGenContext& context) + : FileChecksumGenCrc32c(context) {} + + const char* Name() const override { return "sha1"; } + }; + + class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + return std::unique_ptr( + new MisnamedFileChecksumGenerator(context)); + } + }; + + options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory()); + Reopen(options); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); } #endif // !ROCKSDB_LITE @@ -3399,7 +3671,7 @@ TEST_P(DBBasicTestDeadline, PointLookupDeadline) { std::string key = "k" + ToString(i); ASSERT_OK(Put(key, rnd.RandomString(100))); } - Flush(); + ASSERT_OK(Flush()); bool timedout = true; // A timeout will be forced when the IO counter reaches this value diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index da37babcc26..6108bf5b728 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -7,10 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include +#include +#include "cache/cache_entry_roles.h" #include "cache/lru_cache.h" #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/table.h" #include "util/compression.h" #include "util/random.h" @@ -147,6 +150,17 @@ class DBBlockCacheTest : public DBTestBase { compressed_insert_count_ = new_insert_count; compressed_failure_count_ = new_failure_count; } + +#ifndef ROCKSDB_LITE + const std::array& GetCacheEntryRoleCountsBg() { + // Verify in cache entry role stats + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats(); + return internal_stats_ptr->TEST_GetCacheEntryRoleStats(/*foreground=*/false) + .entry_counts; + } +#endif // ROCKSDB_LITE }; TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) { @@ -228,34 +242,48 @@ TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) { #ifdef SNAPPY TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { - ReadOptions read_options; - auto table_options = GetTableOptions(); - auto options = GetOptions(table_options); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.block_cache_compressed = nullptr; + table_options.block_size = 1; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.compression = CompressionType::kSnappyCompression; - InitTable(options); - std::shared_ptr cache = NewLRUCache(0, 0, false); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 0; i < kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + } + + ReadOptions read_options; std::shared_ptr compressed_cache = NewLRUCache(1 << 25, 0, false); + std::shared_ptr cache = NewLRUCache(0, 0, false); table_options.block_cache = cache; + table_options.no_block_cache = false; table_options.block_cache_compressed = compressed_cache; + table_options.max_auto_readahead_size = 0; + table_options.cache_index_and_filter_blocks = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); RecordCacheCounters(options); - std::vector> iterators(kNumBlocks - 1); - Iterator* iter = nullptr; - // Load blocks into cache. - for (size_t i = 0; i + 1 < kNumBlocks; i++) { - iter = db_->NewIterator(read_options); - iter->Seek(ToString(i)); - ASSERT_OK(iter->status()); + for (size_t i = 0; i < kNumBlocks - 1; i++) { + ASSERT_EQ(value, Get(ToString(i))); CheckCacheCounters(options, 1, 0, 1, 0); CheckCompressedCacheCounters(options, 1, 0, 1, 0); - iterators[i].reset(iter); } + size_t usage = cache->GetUsage(); - ASSERT_LT(0, usage); + ASSERT_EQ(0, usage); ASSERT_EQ(usage, cache->GetPinnedUsage()); size_t compressed_usage = compressed_cache->GetUsage(); ASSERT_LT(0, compressed_usage); @@ -267,24 +295,21 @@ TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { cache->SetCapacity(usage); cache->SetStrictCapacityLimit(true); ASSERT_EQ(usage, cache->GetPinnedUsage()); - iter = db_->NewIterator(read_options); - iter->Seek(ToString(kNumBlocks - 1)); - ASSERT_TRUE(iter->status().IsIncomplete()); - CheckCacheCounters(options, 1, 0, 0, 1); + + // Load last key block. 
+ ASSERT_EQ("Result incomplete: Insert failed due to LRU cache being full.", + Get(ToString(kNumBlocks - 1))); + // Failure won't record the miss counter. + CheckCacheCounters(options, 0, 0, 0, 1); CheckCompressedCacheCounters(options, 1, 0, 1, 0); - delete iter; - iter = nullptr; // Clear strict capacity limit flag. This time we shall hit compressed block - // cache. + // cache and load into block cache. cache->SetStrictCapacityLimit(false); - iter = db_->NewIterator(read_options); - iter->Seek(ToString(kNumBlocks - 1)); - ASSERT_OK(iter->status()); + // Load last key block. + ASSERT_EQ(value, Get(ToString(kNumBlocks - 1))); CheckCacheCounters(options, 1, 0, 1, 0); CheckCompressedCacheCounters(options, 0, 1, 0, 0); - delete iter; - iter = nullptr; } #endif // SNAPPY @@ -432,6 +457,33 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { // filter_bytes_insert); } +#if (defined OS_LINUX || defined OS_WIN) +TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + table_options.cache_index_and_filter_blocks = false; + table_options.prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 1; i <= kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + + ASSERT_EQ(value, Get(ToString(i))); + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT)); + } +} +#endif + namespace { // A mock cache wraps LRUCache, and record how many entries have been @@ -446,15 +498,18 @@ class MockCache : public LRUCache { false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) { } - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), Handle** handle, - Priority priority) override { + using ShardedCache::Insert; + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper_cb, size_t charge, + Handle** handle, Priority priority) override { + DeleterFn delete_cb = helper_cb->del_cb; if (priority == Priority::LOW) { low_pri_insert_count++; } else { high_pri_insert_count++; } - return LRUCache::Insert(key, value, charge, deleter, handle, priority); + return LRUCache::Insert(key, value, charge, delete_cb, handle, priority); } }; @@ -533,6 +588,7 @@ class LookupLiarCache : public CacheWrapper { explicit LookupLiarCache(std::shared_ptr target) : CacheWrapper(std::move(target)) {} + using Cache::Lookup; Handle* Lookup(const Slice& key, Statistics* stats) override { if (nth_lookup_not_found_ == 1) { nth_lookup_not_found_ = 0; @@ -677,7 +733,7 @@ TEST_F(DBBlockCacheTest, ParanoidFileChecks) { // Create a new SST file. This will further trigger a compaction // and generate another file. 
ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(3, /* Totally 3 files created up to now */ TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -692,7 +748,7 @@ TEST_F(DBBlockCacheTest, ParanoidFileChecks) { ASSERT_OK(Put(1, "1_key4", "val4")); ASSERT_OK(Put(1, "9_key4", "val4")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(3, /* Totally 3 files created up to now */ TestGetTickerCount(options, BLOCK_CACHE_ADD)); } @@ -837,8 +893,9 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { Random rnd(301); for (auto compression_type : compression_types) { Options options = CurrentOptions(); - options.compression = compression_type; - options.compression_opts.max_dict_bytes = 4096; + options.bottommost_compression = compression_type; + options.bottommost_compression_opts.max_dict_bytes = 4096; + options.bottommost_compression_opts.enabled = true; options.create_if_missing = true; options.num_levels = 2; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); @@ -859,7 +916,7 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { } ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); @@ -886,8 +943,364 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { } } +static void ClearCache(Cache* cache) { + std::deque keys; + Cache::ApplyToAllEntriesOptions opts; + auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/, + Cache::DeleterFn /*deleter*/) { + keys.push_back(key.ToString()); + }; + cache->ApplyToAllEntries(callback, opts); + for (auto& k : keys) { + cache->Erase(k); + } +} + +TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { + const size_t capacity = size_t{1} << 25; + int iterations_tested = 0; + for (bool partition : {false, true}) { + for (std::shared_ptr cache : + {NewLRUCache(capacity), NewClockCache(capacity)}) { + if (!cache) { + // Skip clock cache when not supported + continue; + } + ++iterations_tested; + + Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.max_open_files = 13; + options.table_cache_numshardbits = 0; + // If this wakes up, it could interfere with test + options.stats_dump_period_sec = 0; + + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(50)); + if (partition) { + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + table_options.metadata_cache_options.top_level_index_pinning = + PinningTier::kNone; + table_options.metadata_cache_options.partition_pinning = + PinningTier::kNone; + table_options.metadata_cache_options.unpartitioned_pinning = + PinningTier::kNone; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create a new table. 
+ ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("zfoo", "value")); + ASSERT_OK(Put("zbar", "value")); + ASSERT_OK(Flush()); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + + // Fresh cache + ClearCache(cache.get()); + + std::array expected{}; + // For CacheEntryStatsCollector + expected[static_cast(CacheEntryRole::kMisc)] = 1; + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + std::array prev_expected = expected; + + // First access only filters + ASSERT_EQ("NOT_FOUND", Get("different from any key added")); + expected[static_cast(CacheEntryRole::kFilterBlock)] += 2; + if (partition) { + expected[static_cast(CacheEntryRole::kFilterMetaBlock)] += 2; + } + // Within some time window, we will get cached entry stats + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Not enough to force a miss + env_->MockSleepForSeconds(45); + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Enough to force a miss + env_->MockSleepForSeconds(601); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + // Now access index and data block + ASSERT_EQ("value", Get("foo")); + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + if (partition) { + // top-level + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + } + expected[static_cast(CacheEntryRole::kDataBlock)]++; + // Enough to force a miss + env_->MockSleepForSeconds(601); + // But inject a simulated long scan so that we need a longer + // interval to force a miss next time. + SyncPoint::GetInstance()->SetCallBack( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", + [this](void*) { + // To spend no more than 0.2% of time scanning, we would need + // interval of at least 10000s + env_->MockSleepForSeconds(20); + }); + SyncPoint::GetInstance()->EnableProcessing(); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + prev_expected = expected; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // The same for other file + ASSERT_EQ("value", Get("zfoo")); + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + if (partition) { + // top-level + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + } + expected[static_cast(CacheEntryRole::kDataBlock)]++; + // Because of the simulated long scan, this is not enough to force + // a miss + env_->MockSleepForSeconds(601); + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // But this is enough + env_->MockSleepForSeconds(10000); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + prev_expected = expected; + + // Also check the GetProperty interface + std::map values; + ASSERT_TRUE( + db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values)); + + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kIndexBlock)]), + values["count.index-block"]); + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kDataBlock)]), + values["count.data-block"]); + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kFilterBlock)]), + values["count.filter-block"]); + EXPECT_EQ( + ToString( + prev_expected[static_cast(CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + EXPECT_EQ(ToString(expected[static_cast(CacheEntryRole::kMisc)]), + values["count.misc"]); + + // Add one for kWriteBuffer + { + WriteBufferManager wbm(size_t{1} << 20, cache); + wbm.ReserveMem(1024); + expected[static_cast(CacheEntryRole::kWriteBuffer)]++; + // Now we check that the GetProperty interface is more agressive about + // re-scanning stats, but not 
totally aggressive. + // Within some time window, we will get cached entry stats + env_->MockSleepForSeconds(1); + EXPECT_EQ(ToString(prev_expected[static_cast( + CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + // Not enough for a "background" miss but enough for a "foreground" miss + env_->MockSleepForSeconds(45); + + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, + &values)); + EXPECT_EQ( + ToString( + expected[static_cast(CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + } + prev_expected = expected; + + // With collector pinned in cache, we should be able to hit + // even if the cache is full + ClearCache(cache.get()); + Cache::Handle* h = nullptr; + ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1, + GetNoopDeleterForRole(), + &h, Cache::Priority::HIGH)); + ASSERT_GT(cache->GetUsage(), cache->GetCapacity()); + expected = {}; + expected[static_cast(CacheEntryRole::kMisc)]++; + // Still able to hit on saved stats + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Enough to force a miss + env_->MockSleepForSeconds(1000); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + cache->Release(h); + } + EXPECT_GE(iterations_tested, 1); + } +} + #endif // ROCKSDB_LITE +class DBBlockCachePinningTest + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBBlockCachePinningTest() + : DBTestBase("/db_block_cache_test", /*env_do_fsync=*/false) {} + + void SetUp() override { + partition_index_and_filters_ = std::get<0>(GetParam()); + top_level_index_pinning_ = std::get<1>(GetParam()); + partition_pinning_ = std::get<2>(GetParam()); + unpartitioned_pinning_ = std::get<3>(GetParam()); + } + + bool partition_index_and_filters_; + PinningTier top_level_index_pinning_; + PinningTier partition_pinning_; + PinningTier unpartitioned_pinning_; +}; + +TEST_P(DBBlockCachePinningTest, TwoLevelDB) { + // Creates one file in L0 and one file in L1. Both files have enough data that + // their index and filter blocks are partitioned. The L1 file will also have + // a compression dictionary (those are trained only during compaction), which + // must be unpartitioned. + const int kKeySize = 32; + const int kBlockSize = 128; + const int kNumBlocksPerFile = 128; + const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize; + + Options options = CurrentOptions(); + // `kNoCompression` makes the unit test more portable. But it relies on the + // current behavior of persisting/accessing dictionary even when there's no + // (de)compression happening, which seems fairly likely to change over time. 
+ options.compression = kNoCompression; + options.compression_opts.max_dict_bytes = 4 << 10; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 20 /* capacity */); + table_options.block_size = kBlockSize; + table_options.metadata_block_size = kBlockSize; + table_options.cache_index_and_filter_blocks = true; + table_options.metadata_cache_options.top_level_index_pinning = + top_level_index_pinning_; + table_options.metadata_cache_options.partition_pinning = partition_pinning_; + table_options.metadata_cache_options.unpartitioned_pinning = + unpartitioned_pinning_; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10 /* bits_per_key */)); + if (partition_index_and_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kKeySize))); + } + ASSERT_OK(Flush()); + if (i == 0) { + // Prevent trivial move so file will be rewritten with dictionary and + // reopened with L1's pinning settings. + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + } + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. + table_options.block_cache->EraseUnRefEntries(); + + // Get base cache values + uint64_t filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t index_misses = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + uint64_t compression_dict_misses = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + + // Read a key from the L0 file + Get(Key(kNumKeysPerFile)); + uint64_t expected_filter_misses = filter_misses; + uint64_t expected_index_misses = index_misses; + uint64_t expected_compression_dict_misses = compression_dict_misses; + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. 
+ table_options.block_cache->EraseUnRefEntries(); + + // Read a key from the L1 file + Get(Key(0)); + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone || + top_level_index_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone || + partition_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); +} + +INSTANTIATE_TEST_CASE_P( + DBBlockCachePinningTest, DBBlockCachePinningTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll))); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index d4610246181..1405b737a52 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -7,11 +7,15 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + #include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #include "table/block_based/filter_policy_internal.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -128,8 +132,8 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); @@ -178,7 +182,7 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) { ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); @@ -244,7 +248,7 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); @@ -297,7 +301,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { // ranges. 
ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); Reopen(options); ASSERT_EQ("NOT_FOUND", Get("foo")); @@ -328,7 +332,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Reopen with both of whole key off and prefix extractor enabled. // Still no bloom filter should be used. @@ -351,7 +355,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); options.prefix_extractor.reset(); bbto.whole_key_filtering = true; @@ -364,7 +368,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { // not filtered out by key ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - Flush(); + ASSERT_OK(Flush()); // Now we have two files: // File 1: An older file with prefix bloom. @@ -467,7 +471,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { for (int i = 0; i < N; i += 100) { ASSERT_OK(Put(1, Key(i), Key(i))); } - Flush(1); + ASSERT_OK(Flush(1)); // Prevent auto compactions triggered by seeks env_->delay_sstable_sync_.store(true, std::memory_order_release); @@ -503,6 +507,21 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { ASSERT_LE(reads, 3 * N / 100); } +#ifndef ROCKSDB_LITE + // Sanity check some table properties + std::map props; + ASSERT_TRUE(db_->GetMapProperty( + handles_[1], DB::Properties::kAggregatedTableProperties, &props)); + uint64_t nkeys = N + N / 100; + uint64_t filter_size = ParseUint64(props["filter_size"]); + EXPECT_LE(filter_size, + (partition_filters_ ? 
12 : 11) * nkeys / /*bits / byte*/ 8); + EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8); + + uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]); + EXPECT_EQ(num_filter_entries, nkeys); +#endif // ROCKSDB_LITE + env_->delay_sstable_sync_.store(false, std::memory_order_release); Close(); } while (ChangeCompactOptions()); @@ -514,24 +533,24 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); + std::make_tuple(BFP::kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestWithParam, ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); + std::make_tuple(BFP::kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kLatestFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion))); + std::make_tuple(BFP::kAutoBloom, true, test::kLatestFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kLatestFormatVersion))); #endif // ROCKSDB_VALGRIND_RUN TEST_F(DBBloomFilterTest, BloomFilterRate) { @@ -771,6 +790,14 @@ class LevelAndStyleCustomFilterPolicy : public FilterPolicy { const std::unique_ptr policy_otherwise_; }; +static std::map + table_file_creation_reason_to_string{ + {TableFileCreationReason::kCompaction, "kCompaction"}, + {TableFileCreationReason::kFlush, "kFlush"}, + {TableFileCreationReason::kMisc, "kMisc"}, + {TableFileCreationReason::kRecovery, "kRecovery"}, + }; + class TestingContextCustomFilterPolicy : public LevelAndStyleCustomFilterPolicy { public: @@ -783,11 +810,17 @@ class TestingContextCustomFilterPolicy const FilterBuildingContext& context) const override { test_report_ += "cf="; test_report_ += context.column_family_name; - test_report_ += ",cs="; + test_report_ += ",s="; test_report_ += OptionsHelper::compaction_style_to_string[context.compaction_style]; - test_report_ += ",lv="; - test_report_ += std::to_string(context.level_at_creation); + test_report_ += ",n="; + test_report_ += ToString(context.num_levels); + test_report_ += ",l="; + test_report_ += ToString(context.level_at_creation); + test_report_ += ",b="; + test_report_ += ToString(int{context.is_bottommost}); + test_report_ += ",r="; + test_report_ += table_file_creation_reason_to_string[context.reason]; test_report_ += "\n"; return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); @@ -805,18 +838,21 @@ class TestingContextCustomFilterPolicy } // namespace TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { + auto policy = std::make_shared(15, 8, 5); + Options options; for (bool fifo : {true, false}) { - Options options = CurrentOptions(); + options = CurrentOptions(); + options.max_open_files = fifo ? -1 : options.max_open_files; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.compaction_style = fifo ? 
kCompactionStyleFIFO : kCompactionStyleLevel; BlockBasedTableOptions table_options; - auto policy = std::make_shared(15, 8, 5); table_options.filter_policy = policy; table_options.format_version = 5; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + TryReopen(options); CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); const int maxKey = 10000; @@ -827,16 +863,16 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); Flush(1); EXPECT_EQ(policy->DumpTestReport(), - fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" - : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); for (int i = maxKey / 2; i < maxKey; i++) { ASSERT_OK(Put(1, Key(i), Key(i))); } Flush(1); EXPECT_EQ(policy->DumpTestReport(), - fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" - : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); // Check that they can be found for (int i = 0; i < maxKey; i++) { @@ -864,7 +900,7 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); EXPECT_EQ(policy->DumpTestReport(), - "cf=bob,cs=kCompactionStyleLevel,lv=1\n"); + "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n"); // Check that we now have one filter, about 9.2% FP rate (5 bits per key) for (int i = 0; i < maxKey; i++) { @@ -876,11 +912,25 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { EXPECT_GE(useful_count, maxKey * 0.90); EXPECT_LE(useful_count, maxKey * 0.91); } + } else { +#ifndef ROCKSDB_LITE + // Also try external SST file + { + std::string file_path = dbname_ + "/external.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("key", "value")); + ASSERT_OK(sst_file_writer.Finish()); + } + // Note: kCompactionStyleLevel is default, ignored if num_levels == -1 + EXPECT_EQ(policy->DumpTestReport(), + "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n"); +#endif } // Destroy ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - dbfull()->DestroyColumnFamilyHandle(handles_[1]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; } } @@ -1444,9 +1494,9 @@ void PrefixScanInit(DBBloomFilterTest* dbtest) { snprintf(buf, sizeof(buf), "%02d______:end", 10); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, - nullptr); // move to level 1 + ASSERT_OK(dbtest->Flush()); + ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr)); // move to level 1 // GROUP 1 for (int i = 1; i <= small_range_sstfiles; i++) { @@ -1563,21 +1613,21 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { for (int key : keys) { ASSERT_OK(Put(1, Key(key), "val")); if (++num_inserted % 1000 == 0) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0, 1) == 0) { // 
No Level 0 file. Create one. ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } for (int i = 1; i < numkeys; i += 2) { @@ -1682,7 +1732,8 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { BottommostLevelCompaction::kSkip; compact_options.change_level = true; compact_options.target_level = 7; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) + .IsNotSupported()); ASSERT_EQ(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); @@ -1714,10 +1765,10 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { int CountIter(std::unique_ptr& iter, const Slice& key) { int count = 0; - for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); - iter->Next()) { + for (iter->Seek(key); iter->Valid(); iter->Next()) { count++; } + EXPECT_OK(iter->status()); return count; } @@ -1730,6 +1781,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; options.create_if_missing = true; + options.env = CurrentOptions().env; options.prefix_extractor.reset(NewCappedPrefixTransform(4)); options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); @@ -1746,7 +1798,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_OK(Put("abcdxxx1", "val2")); ASSERT_OK(Put("abcdxxx2", "val3")); ASSERT_OK(Put("abcdxxx3", "val4")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // prefix_extractor has not changed, BF will always be read Slice upper_bound("abce"); @@ -1860,6 +1912,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { for (auto bfp_impl : BFP::kAllFixedImpls) { int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -1903,7 +1956,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_OK(Put("foo4", "bar4")); ASSERT_OK(Put("foq5", "bar5")); ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is cappped:3 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); @@ -1927,7 +1980,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_OK(Put("foo7", "bar7")); ASSERT_OK(Put("foq8", "bar8")); ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is fixed:2 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); @@ -2038,10 +2091,10 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); - dbfull()->DestroyColumnFamilyHandle(handles_[2]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2])); handles_[2] = nullptr; ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - dbfull()->DestroyColumnFamilyHandle(handles_[1]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; iteration++; } @@ -2052,6 +2105,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { for (auto bfp_impl : BFP::kAllFixedImpls) { Options options; + options.env 
= CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -2113,6 +2167,54 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { } } +TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) { + Options options = CurrentOptions(); + constexpr size_t kNumKeys = 10000; + static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000"); + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeys + 10)); + options.create_if_missing = true; + constexpr size_t kPrefixLength = 4; + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(50)); + bbto.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + bbto.block_size = 128; + bbto.metadata_block_size = 128; + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const std::string value(64, '\0'); + + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t i = 0; i < kNumKeys; ++i) { + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i; + ASSERT_OK(db_->Put(write_opts, oss.str(), value)); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + // Use legacy, implicit prefix seek + read_opts.total_order_seek = false; + read_opts.auto_prefix_mode = false; + std::unique_ptr it(db_->NewIterator(read_opts)); + for (size_t i = 0; i < kNumKeys; ++i) { + // Seek with a key after each one added but with same prefix. One will + // surely cross a partition boundary. + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a"; + it->SeekForPrev(oss.str()); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + } + it.reset(); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index fb9186caead..d5a8db1c3e2 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -42,7 +42,7 @@ class DBTestCompactionFilterWithCompactParam option_config_ == kUniversalSubcompactions) { assert(options.max_subcompactions > 1); } - TryReopen(options); + Reopen(options); } }; @@ -82,6 +82,11 @@ class DeleteFilter : public CompactionFilter { return true; } + bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const override { + return true; + } + const char* Name() const override { return "DeleteFilter"; } }; @@ -190,18 +195,36 @@ class KeepFilterFactory : public CompactionFilterFactory { bool compaction_filter_created_; }; +// This filter factory is configured with a `TableFileCreationReason`. Only +// table files created for that reason will undergo filtering. This +// configurability makes it useful to tests for filtering non-compaction table +// files, such as "CompactionFilterFlush" and "CompactionFilterRecovery". 
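Aside: a minimal sketch (not part of this diff; the class names are hypothetical, only the overridden virtuals are the RocksDB API also used by the test factory below) of how an application-side factory can opt into flush-time filtering through `ShouldFilterTableFileCreation()`:

#include <memory>
#include <string>

#include "rocksdb/compaction_filter.h"

// Hypothetical filter that drops everything it is asked about.
class DropEverythingFilter : public rocksdb::CompactionFilter {
 public:
  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
              const rocksdb::Slice& /*value*/, std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return true;  // drop every key-value pair this filter sees
  }
  const char* Name() const override { return "DropEverythingFilter"; }
};

// Hypothetical factory that filters only table files created by flush;
// compaction- and recovery-created files are left alone.
class FlushOnlyFilterFactory : public rocksdb::CompactionFilterFactory {
 public:
  bool ShouldFilterTableFileCreation(
      rocksdb::TableFileCreationReason reason) const override {
    return reason == rocksdb::TableFileCreationReason::kFlush;
  }
  std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
      const rocksdb::CompactionFilter::Context& /*context*/) override {
    return std::unique_ptr<rocksdb::CompactionFilter>(
        new DropEverythingFilter());
  }
  const char* Name() const override { return "FlushOnlyFilterFactory"; }
};

With such a factory installed in `options.compaction_filter_factory`, data would be dropped at flush time but preserved across compaction and recovery, which is the behavior the `CompactionFilterFlush` test further below asserts for the analogous `DeleteFilterFactory` configured with `TableFileCreationReason::kFlush`.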
 class DeleteFilterFactory : public CompactionFilterFactory {
  public:
+  explicit DeleteFilterFactory(TableFileCreationReason reason)
+      : reason_(reason) {}
+
   std::unique_ptr<CompactionFilter> CreateCompactionFilter(
       const CompactionFilter::Context& context) override {
-    if (context.is_manual_compaction) {
-      return std::unique_ptr<CompactionFilter>(new DeleteFilter());
-    } else {
+    EXPECT_EQ(reason_, context.reason);
+    if (context.reason == TableFileCreationReason::kCompaction &&
+        !context.is_manual_compaction) {
+      // Table files created by automatic compaction do not undergo filtering.
+      // Presumably some tests rely on this.
       return std::unique_ptr<CompactionFilter>(nullptr);
     }
+    return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+  }
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    return reason_ == reason;
   }
 
   const char* Name() const override { return "DeleteFilterFactory"; }
+
+ private:
+  const TableFileCreationReason reason_;
 };
 
 // Delete Filter Factory which ignores snapshots
@@ -276,7 +299,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) {
   for (int i = 0; i < 100000; i++) {
     char key[100];
     snprintf(key, sizeof(key), "B%010d", i);
-    Put(1, key, value);
+    ASSERT_OK(Put(1, key, value));
   }
   ASSERT_OK(Flush(1));
@@ -284,10 +307,10 @@
   // the compaction is each level invokes the filter for
   // all the keys in that level.
   cfilter_count = 0;
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
   ASSERT_EQ(cfilter_count, 100000);
   cfilter_count = 0;
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
   ASSERT_EQ(cfilter_count, 100000);
   ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
@@ -314,13 +337,14 @@
     ASSERT_OK(iter->status());
     while (iter->Valid()) {
       ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-      ASSERT_OK(ParseInternalKey(iter->key(), &ikey));
+      ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
       total++;
       if (ikey.sequence != 0) {
         count++;
       }
       iter->Next();
     }
+    ASSERT_OK(iter->status());
   }
   ASSERT_EQ(total, 100000);
   ASSERT_EQ(count, 0);
@@ -337,10 +361,10 @@
   // means that all keys should pass at least once
   // via the compaction filter
   cfilter_count = 0;
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
   ASSERT_EQ(cfilter_count, 100000);
   cfilter_count = 0;
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
   ASSERT_EQ(cfilter_count, 100000);
   ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
   ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
@@ -348,7 +372,8 @@
   // create a new database with the compaction
   // filter in such a way that it deletes all keys
-  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+      TableFileCreationReason::kCompaction);
   options.create_if_missing = true;
   DestroyAndReopen(options);
   CreateAndReopenWithCF({"pikachu"}, options);
@@ -369,10 +394,10 @@
   // verify that at the end of the compaction process,
   // nothing is left.
cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 0); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); @@ -387,6 +412,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { count++; iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 0); } @@ -405,7 +431,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_OK(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); ASSERT_NE(ikey.sequence, (unsigned)0); count++; iter->Next(); @@ -419,7 +445,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { // entries in VersionEdit, but none of the 'AddFile's. TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { Options options = CurrentOptions(); - options.compaction_filter_factory = std::make_shared(); + options.compaction_filter_factory = std::make_shared( + TableFileCreationReason::kCompaction); options.disable_auto_compactions = true; options.create_if_missing = true; DestroyAndReopen(options); @@ -427,9 +454,9 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { // put some data for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); + ASSERT_OK(Flush()); } // this will produce empty file (delete compaction filter) @@ -440,6 +467,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); + ASSERT_OK(itr->status()); // empty db ASSERT_TRUE(!itr->Valid()); @@ -447,6 +475,64 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { } #endif // ROCKSDB_LITE +TEST_F(DBTestCompactionFilter, CompactionFilterFlush) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by flush. + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kFlush); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by recovery. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + +TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by recovery. + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kRecovery); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in recovery. 
+ ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + TEST_P(DBTestCompactionFilterWithCompactParam, CompactionFilterWithValueChange) { Options options = CurrentOptions(); @@ -463,25 +549,25 @@ TEST_P(DBTestCompactionFilterWithCompactParam, for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } // push all files to lower levels ASSERT_OK(Flush(1)); if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); } // re-write all data again for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } // push all files to lower levels. This should @@ -489,11 +575,11 @@ TEST_P(DBTestCompactionFilterWithCompactParam, ASSERT_OK(Flush(1)); if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); } // verify that all keys now have the new value that @@ -531,7 +617,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { ASSERT_OK(Flush()); std::string newvalue = Get("foo"); ASSERT_EQ(newvalue, three); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("foo"); ASSERT_EQ(newvalue, three); @@ -539,12 +625,12 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { // merge keys. 
ASSERT_OK(db_->Put(WriteOptions(), "bar", two)); ASSERT_OK(Flush()); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("bar"); ASSERT_EQ("NOT_FOUND", newvalue); ASSERT_OK(db_->Merge(WriteOptions(), "bar", two)); ASSERT_OK(Flush()); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("bar"); ASSERT_EQ(two, two); @@ -555,7 +641,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { ASSERT_OK(Flush()); newvalue = Get("foobar"); ASSERT_EQ(newvalue, three); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("foobar"); ASSERT_EQ(newvalue, three); @@ -568,7 +654,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { ASSERT_OK(Flush()); newvalue = Get("barfoo"); ASSERT_EQ(newvalue, four); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("barfoo"); ASSERT_EQ(newvalue, four); } @@ -590,21 +676,21 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { for (int i = 0; i < num_keys_per_file; i++) { char key[100]; snprintf(key, sizeof(key), "B%08d%02d", i, j); - Put(key, value); + ASSERT_OK(Put(key, value)); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Make sure next file is much smaller so automatic compaction will not // be triggered. num_keys_per_file /= 2; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Force a manual compaction cfilter_count = 0; filter->expect_manual_compaction_.store(true); filter->expect_full_compaction_.store(true); filter->expect_cf_id_.store(0); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(cfilter_count, 700); ASSERT_EQ(NumSortedRuns(0), 1); ASSERT_TRUE(filter->compaction_filter_created()); @@ -624,7 +710,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_OK(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); total++; if (ikey.sequence != 0) { count++; @@ -654,14 +740,14 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) { for (int i = 0; i < num_keys_per_file; i++) { char key[100]; snprintf(key, sizeof(key), "B%08d%02d", i, j); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } - Flush(1); + ASSERT_OK(Flush(1)); // Make sure next file is much smaller so automatic compaction will not // be triggered. 
     num_keys_per_file /= 2;
   }
-  dbfull()->TEST_WaitForCompact();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_TRUE(filter->compaction_filter_created());
 }
@@ -680,9 +766,9 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
   const Snapshot* snapshot = nullptr;
   for (int table = 0; table < 4; ++table) {
     for (int i = 0; i < 10; ++i) {
-      Put(ToString(table * 100 + i), "val");
+      ASSERT_OK(Put(ToString(table * 100 + i), "val"));
     }
-    Flush();
+    ASSERT_OK(Flush());
     if (table == 0) {
       snapshot = db_->GetSnapshot();
@@ -702,6 +788,7 @@
   read_options.snapshot = snapshot;
   std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
   iter->SeekToFirst();
+  ASSERT_OK(iter->status());
   int count = 0;
   while (iter->Valid()) {
     count++;
@@ -710,6 +797,7 @@
   ASSERT_EQ(count, 6);
   read_options.snapshot = nullptr;
   std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options));
+  ASSERT_OK(iter1->status());
   iter1->SeekToFirst();
   count = 0;
   while (iter1->Valid()) {
@@ -740,9 +828,9 @@ TEST_F(DBTestCompactionFilter, SkipUntil) {
     for (int i = table * 6; i < 39 + table * 11; ++i) {
       char key[100];
       snprintf(key, sizeof(key), "%010d", table * 100 + i);
-      Put(key, std::to_string(table * 1000 + i));
+      ASSERT_OK(Put(key, std::to_string(table * 1000 + i)));
     }
-    Flush();
+    ASSERT_OK(Flush());
   }
   cfilter_skips = 0;
@@ -781,10 +869,10 @@ TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) {
   options.create_if_missing = true;
   DestroyAndReopen(options);
-  Put("0000000010", "v10");
-  Put("0000000020", "v20"); // skipped
-  Put("0000000050", "v50");
-  Flush();
+  ASSERT_OK(Put("0000000010", "v10"));
+  ASSERT_OK(Put("0000000020", "v20")); // skipped
+  ASSERT_OK(Put("0000000050", "v50"));
+  ASSERT_OK(Flush());
   cfilter_skips = 0;
   EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
@@ -822,13 +910,13 @@ TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) {
   options.compaction_filter = new TestNotSupportedFilter();
   DestroyAndReopen(options);
-  Put("a", "v10");
-  Put("z", "v20");
-  Flush();
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_OK(Put("z", "v20"));
+  ASSERT_OK(Flush());
-  Put("a", "v10");
-  Put("z", "v20");
-  Flush();
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_OK(Put("z", "v20"));
+  ASSERT_OK(Flush());
   // Comapction should fail because IgnoreSnapshots() = false
   EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
@@ -837,6 +925,49 @@
   delete options.compaction_filter;
 }
+class TestNotSupportedFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit TestNotSupportedFilterFactory(TableFileCreationReason reason)
+      : reason_(reason) {}
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    return reason_ == reason;
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /* context */) override {
+    return std::unique_ptr<CompactionFilter>(new TestNotSupportedFilter());
+  }
+
+  const char* Name() const override { return "TestNotSupportedFilterFactory"; }
+
+ private:
+  const TableFileCreationReason reason_;
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<TestNotSupportedFilterFactory>(
+          TableFileCreationReason::kFlush);
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_TRUE(Flush().IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) {
Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared( + TableFileCreationReason::kRecovery); + Reopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index aac39d980d9..6cb1abfab16 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -7,7 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include + +#include "db/blob/blob_index.h" #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/concurrent_task_limiter.h" @@ -18,6 +22,7 @@ #include "util/concurrent_task_limiter_impl.h" #include "util/random.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -176,6 +181,7 @@ Options DeletionTriggerOptions(Options options) { options.target_file_size_base * options.target_file_size_multiplier; options.max_bytes_for_level_multiplier = 2; options.disable_auto_compactions = false; + options.compaction_options_universal.max_size_amplification_percent = 100; return options; } @@ -251,7 +257,7 @@ void VerifyCompactionStats(ColumnFamilyData& cfd, const CompactionStatsCollector& collector) { #ifndef NDEBUG InternalStats* internal_stats_ptr = cfd.internal_stats(); - ASSERT_TRUE(internal_stats_ptr != nullptr); + ASSERT_NE(internal_stats_ptr, nullptr); const std::vector& comp_stats = internal_stats_ptr->TEST_GetCompactionStats(); const int num_of_reasons = static_cast(CompactionReason::kNumOfReasons); @@ -323,19 +329,41 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); - // must have much smaller db size. - ASSERT_GT(db_size[0] / 3, db_size[1]); + if (options.compaction_style == kCompactionStyleUniversal) { + // Claim: in universal compaction none of the original data will remain + // once compactions settle. + // + // Proof: The compensated size of the file containing the most tombstones + // is enough on its own to trigger size amp compaction. Size amp + // compaction is a full compaction, so all tombstones meet the obsolete + // keys they cover. + ASSERT_EQ(0, db_size[1]); + } else { + // Claim: in level compaction at most `db_size[0] / 2` of the original + // data will remain once compactions settle. + // + // Proof: Assume the original data is all in the bottom level. If it were + // not, it would meet its tombstone sooner. The original data size is + // large enough to require fanout to bottom level to be greater than + // `max_bytes_for_level_multiplier == 2`. 
In the level just above, + // tombstones must cover less than `db_size[0] / 4` bytes since fanout >= + // 2 and file size is compensated by doubling the size of values we expect + // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in + // levels above must cover less than `db_size[0] / 8` bytes of original + // data, `db_size[0] / 16`, and so on. + ASSERT_GT(db_size[0] / 2, db_size[1]); + } } } #endif // ROCKSDB_VALGRIND_RUN @@ -382,8 +410,9 @@ TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_TRUE( + dbfull()->CompactRange(cro, nullptr, nullptr).IsInvalidArgument()); // check that normal user iterator doesn't see anything Iterator* db_iter = dbfull()->NewIterator(ReadOptions()); @@ -391,6 +420,7 @@ TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; } + ASSERT_OK(db_iter->status()); ASSERT_EQ(i, 0); delete db_iter; @@ -398,6 +428,7 @@ TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { ReadOptions ro; ro.iter_start_seqnum=1; db_iter = dbfull()->NewIterator(ro); + ASSERT_OK(db_iter->status()); i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; @@ -407,9 +438,10 @@ TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { // now all deletes should be gone SetPreserveDeletesSequenceNumber(100000000); - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); db_iter = dbfull()->NewIterator(ro); + ASSERT_TRUE(db_iter->status().IsInvalidArgument()); i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; @@ -471,6 +503,10 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { options.new_table_reader_for_compaction_inputs = true; options.max_open_files = 20; options.level0_file_num_compaction_trigger = 3; + // Avoid many shards with small max_open_files, where as little as + // two table insertions could lead to an LRU eviction, depending on + // hash values. + options.table_cache_numshardbits = 2; DestroyAndReopen(options); Random rnd(301); @@ -495,8 +531,8 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { ASSERT_OK(Put(Key(10 - k), "bar")); if (k < options.level0_file_num_compaction_trigger - 1) { num_table_cache_lookup = 0; - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // preloading iterator issues one table cache lookup and create // a new table reader, if not preloaded. int old_num_table_cache_lookup = num_table_cache_lookup; @@ -514,8 +550,8 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { num_table_cache_lookup = 0; num_new_table_reader = 0; - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Preloading iterator issues one table cache lookup and creates // a new table reader. One file is created for flush and one for compaction. 
// Compaction inputs make no table cache look-up for data/range deletion @@ -542,7 +578,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { cro.change_level = true; cro.target_level = 2; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // Only verifying compaction outputs issues one table cache lookup // for both data block and range deletion block). // May preload table cache too. @@ -583,9 +619,9 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); Close(); // round 2 --- disable auto-compactions and issue deletions. @@ -596,11 +632,10 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); Close(); - // as auto_compaction is off, we shouldn't see too much reduce - // in db size. - ASSERT_LT(db_size[0] / 3, db_size[1]); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. @@ -610,11 +645,17 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { for (int k = 0; k < kTestSize / 10; ++k) { ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[2] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); // this time we're expecting significant drop in size. - ASSERT_GT(db_size[0] / 3, db_size[2]); + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. In addition to that, this + // test inserts `db_size[0] / 10` to push the tombstones into SST files and + // then through automatic compactions. So in total `3 * db_size[0] / 5` of + // the original data may remain. 
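Aside on the `3 * db_size[0] / 5` bound asserted just below: it combines the two contributions described in the comment above, at most `db_size[0] / 2` of the original data surviving the deletion-triggered compactions plus roughly `db_size[0] / 10` of freshly re-inserted data, and `db_size[0] / 2 + db_size[0] / 10 = 6 * db_size[0] / 10 = 3 * db_size[0] / 5`.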
+ ASSERT_GT(3 * db_size[0] / 5, db_size[2]); } } @@ -630,7 +671,7 @@ TEST_F(DBCompactionTest, CompactRangeBottomPri) { CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); } ASSERT_EQ("0,0,3", FilesPerLevel(0)); @@ -663,7 +704,7 @@ TEST_F(DBCompactionTest, CompactRangeBottomPri) { }); SyncPoint::GetInstance()->EnableProcessing(); env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(1, low_pri_count); ASSERT_EQ(1, bottom_pri_count); ASSERT_EQ("0,0,2", FilesPerLevel(0)); @@ -671,12 +712,12 @@ TEST_F(DBCompactionTest, CompactRangeBottomPri) { // Recompact bottom most level uses bottom pool CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ(1, low_pri_count); ASSERT_EQ(2, bottom_pri_count); env_->SetBackgroundThreads(0, Env::Priority::BOTTOM); - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); // Low pri pool is used if bottom pool has size 0. ASSERT_EQ(2, low_pri_count); ASSERT_EQ(2, bottom_pri_count); @@ -701,9 +742,16 @@ TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // L1 and L2 can fit deletions iff size compensation does not take effect, + // i.e., when `skip_stats_update_on_db_open == true`. Move any remaining + // files at or above L2 down to L3 to ensure obsolete data does not + // accidentally meet its tombstone above L3. This makes the final size more + // deterministic and easy to see whether size compensation for deletions + // took effect. + MoveFilesToLevel(3 /* level */); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); Close(); // round 2 --- disable auto-compactions and issue deletions. @@ -716,27 +764,33 @@ TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); Close(); - // as auto_compaction is off, we shouldn't see too much reduce - // in db size. - ASSERT_LT(db_size[0] / 3, db_size[1]); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. options.disable_auto_compactions = false; Reopen(options); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[2] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); if (options.skip_stats_update_on_db_open) { // If update stats on DB::Open is disable, we don't expect // deletion entries taking effect. 
- ASSERT_LT(db_size[0] / 3, db_size[2]); + // + // The deletions are small enough to fit in L1 and L2, and obsolete keys + // were moved to L3+, so none of the original data should have been + // dropped. + ASSERT_LE(db_size[0], db_size[2]); } else { // Otherwise, we should see a significant drop in db size. - ASSERT_GT(db_size[0] / 3, db_size[2]); + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. + ASSERT_GT(db_size[0] / 2, db_size[2]); } } } @@ -766,7 +820,7 @@ TEST_P(DBCompactionTestWithParam, CompactionTrigger) { } // put extra key to trigger flush ASSERT_OK(Put(1, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } @@ -778,7 +832,7 @@ TEST_P(DBCompactionTestWithParam, CompactionTrigger) { } // put extra key to trigger flush ASSERT_OK(Put(1, "", "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); @@ -820,7 +874,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); } } @@ -837,7 +891,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { } // put extra key to trigger flush ASSERT_OK(Put(2, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 2)); } @@ -848,7 +902,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { sleeping_tasks[i].WakeUp(); sleeping_tasks[i].WaitUntilDone(); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify number of compactions allowed will come back to 1. 
@@ -865,7 +919,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); } } @@ -898,8 +952,8 @@ TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) { // Reopening moves updates to level-0 ReopenWithColumnFamilies({"default", "pikachu"}, options); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); @@ -943,27 +997,27 @@ TEST_F(DBCompactionTest, UserKeyCrossFile1) { DestroyAndReopen(options); // create first file and flush to l0 - Put("4", "A"); - Put("3", "A"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - - Put("2", "A"); - Delete("3"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("4", "A")); + ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("3")); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get("3")); for (int i = 0; i < 3; i++) { - Put("2", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("NOT_FOUND", Get("3")); } @@ -976,27 +1030,27 @@ TEST_F(DBCompactionTest, UserKeyCrossFile2) { DestroyAndReopen(options); // create first file and flush to l0 - Put("4", "A"); - Put("3", "A"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - - Put("2", "A"); - SingleDelete("3"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("4", "A")); + ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(SingleDelete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("3")); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get("3")); for (int i = 0; i < 3; i++) { - Put("2", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("NOT_FOUND", Get("3")); } @@ -1012,17 +1066,17 @@ TEST_F(DBCompactionTest, CompactionSstPartitioner) { DestroyAndReopen(options); // create first file and flush to l0 - Put("aaaa1", "A"); - Put("bbbb1", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - Put("aaaa1", "A2"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("aaaa1", "A2")); + 
ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); std::vector files; dbfull()->GetLiveFilesMetaData(&files); @@ -1042,11 +1096,11 @@ TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { DestroyAndReopen(options); // create first file and flush to l0 - Put("aaaa1", "A"); - Put("bbbb1", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); std::vector files; dbfull()->GetLiveFilesMetaData(&files); @@ -1076,22 +1130,23 @@ TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { // create first file and flush to l0 for (auto& key : {"1", "2", "3", "3", "3", "3"}) { - Put(key, std::string(key_len, 'A')); + ASSERT_OK(Put(key, std::string(key_len, 'A'))); snaps.push_back(dbfull()->GetSnapshot()); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // create second file and flush to l0 for (auto& key : {"3", "4", "5", "6", "7", "8"}) { - Put(key, std::string(key_len, 'A')); + ASSERT_OK(Put(key, std::string(key_len, 'A'))); snaps.push_back(dbfull()->GetSnapshot()); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // move both files down to l1 - dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1); + ASSERT_OK( + dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1)); // release snap so that first instance of key(3) can have seqId=0 for (auto snap : snaps) { @@ -1100,12 +1155,12 @@ TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { // create 3 files in l0 so to trigger compaction for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { - Put("2", std::string(1, 'A')); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(Put("", "")); } @@ -1120,12 +1175,12 @@ TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) { for (int i = 0; i < 2; ++i) { for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) { // make l0 files' ranges overlap to avoid trivial move - Put(std::to_string(2 * i), std::string(1, 'A')); - Put(std::to_string(2 * i + 1), std::string(1, 'A')); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A'))); + ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1); } @@ -1141,7 +1196,7 @@ TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) { // note CompactionOptions::output_file_size_limit is unset. 
CompactionOptions compact_opt; compact_opt.compression = kNoCompression; - dbfull()->CompactFiles(compact_opt, input_filenames, 1); + ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1)); } // Check that writes done during a memtable compaction are recovered @@ -1202,7 +1257,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { cro.exclusive_manual_compaction = exclusive_manual_compaction_; // Compaction will initiate a trivial move from L0 to L1 - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); // File moved From L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0 @@ -1271,7 +1326,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { // Since data is non-overlapping we expect compaction to initiate // a trivial move - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // We expect that all the files were trivially moved from L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files); @@ -1308,7 +1363,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { ASSERT_OK(Flush()); } - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { @@ -1512,8 +1567,8 @@ TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) { ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0)); TEST_SYNC_POINT("DBCompaction::ManualPartial:5"); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // After two non-trivial compactions are installed, there is 1 file in L6, and // 1 file in L1 ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0)); @@ -1626,7 +1681,7 @@ TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) { for (int32_t j = 300; j < 4300; j++) { if (j == 2300) { ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); @@ -1642,8 +1697,8 @@ TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) { } TEST_SYNC_POINT("DBCompaction::PartialFill:2"); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); threads.join(); for (int32_t i = 0; i < 4300; i++) { @@ -1661,12 +1716,12 @@ TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) { Options options = CurrentOptions(); options.unordered_write = true; DestroyAndReopen(options); - Put("foo", "v1"); + ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Flush()); - Put("bar", "v1"); + ASSERT_OK(Put("bar", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - port::Thread writer([&]() { Put("foo", "v2"); }); + port::Thread writer([&]() { ASSERT_OK(Put("foo", "v2")); }); TEST_SYNC_POINT( "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"); @@ -1732,15 +1787,15 @@ TEST_F(DBCompactionTest, DeleteFileRange) { for (int32_t j = 300; j < 4300; j++) { if (j == 2300) { ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } } ASSERT_OK(Flush()); - 
dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify level sizes uint64_t target_size = 4 * options.max_bytes_for_level_base; @@ -1750,7 +1805,7 @@ TEST_F(DBCompactionTest, DeleteFileRange) { options.max_bytes_for_level_multiplier); } - size_t old_num_files = CountFiles(); + const size_t old_num_files = CountFiles(); std::string begin_string = Key(1000); std::string end_string = Key(2000); Slice begin(begin_string); @@ -1785,7 +1840,7 @@ TEST_F(DBCompactionTest, DeleteFileRange) { compact_options.change_level = true; compact_options.target_level = 1; ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK( DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); @@ -1794,12 +1849,11 @@ TEST_F(DBCompactionTest, DeleteFileRange) { for (int32_t i = 0; i < 4300; i++) { ReadOptions roptions; std::string result; - Status s = db_->Get(roptions, Key(i), &result); - ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound()); deleted_count2++; } ASSERT_GT(deleted_count2, deleted_count); - size_t new_num_files = CountFiles(); + const size_t new_num_files = CountFiles(); ASSERT_GT(old_num_files, new_num_files); } @@ -1954,14 +2008,14 @@ TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { std::string vals[kNumL0Files]; for (int i = 0; i < kNumL0Files; ++i) { vals[i] = rnd.RandomString(kValSize); - Put(Key(i), vals[i]); - Put(Key(i + 1), vals[i]); - Flush(); + ASSERT_OK(Put(Key(i), vals[i])); + ASSERT_OK(Put(Key(i + 1), vals[i])); + ASSERT_OK(Flush()); if (i == 0) { snapshot = db_->GetSnapshot(); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify `DeleteFilesInRange` can't drop only file 0 which would cause // "1 -> vals[0]" to reappear. @@ -2048,16 +2102,8 @@ TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) { options.num_levels = 4; options.max_bytes_for_level_base = 400 * 1024; options.max_subcompactions = max_subcompactions_; - // options = CurrentOptions(options); - std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); - } - env_->DeleteDir(options.db_paths[1].path); - Reopen(options); + DestroyAndReopen(options); Random rnd(301); int key_idx = 0; @@ -2165,16 +2211,8 @@ TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) { options.num_levels = 4; options.max_bytes_for_level_base = 400 * 1024; options.max_subcompactions = max_subcompactions_; - // options = CurrentOptions(options); - std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. 
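// ---------------------------------------------------------------------------
// A minimal standalone sketch (not from this patch) of the API the
// DeleteFileRange test above exercises: DeleteFilesInRange() from
// rocksdb/convenience.h drops whole SST files fully contained in a key range
// without rewriting any data. The wrapper name below is illustrative.
#include <string>
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"

rocksdb::Status DropKeyRangeFiles(rocksdb::DB* db, const std::string& from,
                                  const std::string& to) {
  rocksdb::Slice begin(from);
  rocksdb::Slice end(to);
  // Only files whose key range lies entirely inside [begin, end] are deleted;
  // keys in partially overlapping files survive, which is why the test still
  // issues a CompactRange() afterwards to make every key disappear.
  return rocksdb::DeleteFilesInRange(db, db->DefaultColumnFamily(), &begin,
                                     &end);
}
// ---------------------------------------------------------------------------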
- for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); - } - env_->DeleteDir(options.db_paths[1].path); - Reopen(options); + DestroyAndReopen(options); Random rnd(301); int key_idx = 0; @@ -2411,7 +2449,7 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(TotalTableFiles(1, 4), 1); int non_level0_num_files = 0; @@ -2447,7 +2485,8 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { compact_options.bottommost_level_compaction = BottommostLevelCompaction::kForce; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); // Only 1 file in L0 ASSERT_EQ("1", FilesPerLevel(1)); @@ -2468,9 +2507,9 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); } - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 1; i < options.num_levels; i++) { ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); @@ -2480,6 +2519,7 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { // compaction style std::string keys_in_db; Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { keys_in_db.append(iter->key().ToString()); keys_in_db.push_back(','); @@ -2517,24 +2557,24 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) { TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Delete(1, "e"); - Put(1, "", ""); + ASSERT_OK(Delete(1, "e")); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "c", "cv"); + ASSERT_OK(Put(1, "c", "cv")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "d", "dv"); + ASSERT_OK(Put(1, "d", "dv")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Delete(1, "d"); - Delete(1, "b"); + ASSERT_OK(Delete(1, "d")); + ASSERT_OK(Delete(1, "b")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("(->)(c->cv)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish @@ -2551,34 +2591,35 @@ TEST_F(DBCompactionTest, ManualAutoRace) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put(1, "foo", ""); - Put(1, "bar", ""); - Flush(1); - Put(1, "foo", ""); - Put(1, "bar", ""); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); + 
ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); // Generate four files in CF 0, which should trigger an auto compaction - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); // The auto compaction is scheduled but waited until here TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1"); // The auto compaction will wait until the manual compaction is registerd // before processing so that it will be cancelled. - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); ASSERT_EQ("0,1", FilesPerLevel(1)); // Eventually the cancelled compaction will be rescheduled and executed. - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -2623,7 +2664,7 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) { options.statistics->getTickerCount(BLOCK_CACHE_ADD); CompactRangeOptions cro; cro.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(cro, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); // Verify manual compaction doesn't fill block cache ASSERT_EQ(prev_block_cache_add, options.statistics->getTickerCount(BLOCK_CACHE_ADD)); @@ -2704,7 +2745,8 @@ TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { CompactRangeOptions compact_options; compact_options.target_path_id = 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); @@ -2763,8 +2805,8 @@ TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) { for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { ASSERT_OK(Put(1, ToString(key), rnd.RandomString(kTestValueSize))); } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); @@ -2840,10 +2882,10 @@ TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { keys.emplace_back(rnd.RandomString(kKeySize)); values.emplace_back(rnd.RandomString(kKvSize - kKeySize)); ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); // Make sure the number of L0 files can trigger compaction. 
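// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the ordering facility that
// ManualAutoRace and many later tests use: the internal sync-point API from
// test_util/sync_point.h, which is test-only and not part of the public
// interface. The marker names below are placeholders, not the exact markers
// used in this file.
#include "test_util/sync_point.h"

void ArmExampleSyncPointOrdering() {
  // A pair {"A", "B"} means "A happens before B": whichever thread reaches
  // marker "B" blocks until some thread has already passed marker "A".
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"Example::Background:Done", "Example::Test:Check"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  // The test thread later emits TEST_SYNC_POINT("Example::Test:Check") and
  // finally calls DisableProcessing() to restore normal execution.
}
// ---------------------------------------------------------------------------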
ASSERT_GE(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); @@ -2909,7 +2951,7 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { ASSERT_OK(Flush()); } // this should execute L0->L1 - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); // block compactions @@ -2926,7 +2968,7 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { sleeping_task.WaitUntilDone(); // this should execute L1->L2 (move) - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); @@ -2944,7 +2986,7 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->L2 (merge with previous file) - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,2", FilesPerLevel(0)); @@ -2952,6 +2994,7 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { ASSERT_OK(env_->FileExists(dbname_ + moved_file_name)); listener->SetExpectedFileName(dbname_ + moved_file_name); + ASSERT_OK(iterator->status()); iterator.reset(); // this file should have been compacted away @@ -3114,7 +3157,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { for (int num = 0; num < 10; num++) { GenerateNewRandomFile(&rnd); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"CompactionJob::Run():Start", @@ -3135,7 +3178,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"); GenerateNewRandomFile(&rnd, /* nowait */ true); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; num++) { @@ -3145,7 +3188,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { TEST_SYNC_POINT( "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } static std::string ShortKey(int i) { @@ -3270,7 +3313,7 @@ TEST_P(DBCompactionTestWithParam, IntraL0Compaction) { table_options.block_cache = NewLRUCache(64 << 20); // 64MB table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -3309,7 +3352,7 @@ TEST_P(DBCompactionTestWithParam, IntraL0Compaction) { } ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; @@ -3387,7 +3430,7 @@ TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { ASSERT_OK(Put(Key(i + 1), value)); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; @@ -3435,7 +3478,7 @@ TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) { int key_idx = 0; GenerateNewFile(&rnd, &key_idx); } 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, num_bottom_pri_compactions); @@ -3459,8 +3502,8 @@ TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { // So key 0, 2, and 4+ fall outside these levels' key-ranges. for (int level = 2; level >= 1; --level) { for (int i = 0; i < 2; ++i) { - Put(Key(2 * i + 1), "val"); - Flush(); + ASSERT_OK(Put(Key(2 * i + 1), "val")); + ASSERT_OK(Flush()); } MoveFilesToLevel(level); ASSERT_EQ(2, NumTableFilesAtLevel(level)); @@ -3470,11 +3513,11 @@ TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { // - Tombstones for keys 2 and 4 can be dropped early. // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges. for (int i = 0; i < kNumL0Files; ++i) { - Put(Key(0), "val"); // sentinel to prevent trivial move - Delete(Key(i + 1)); - Flush(); + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Delete(Key(i + 1))); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 0; i < kNumL0Files; ++i) { std::string value; @@ -3538,10 +3581,10 @@ TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) { TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { // Regression test for bug of not pulling in L0 files that overlap the user- // specified input files in time- and key-ranges. - Put(Key(0), "old_val"); - Flush(); - Put(Key(0), "new_val"); - Flush(); + ASSERT_OK(Put(Key(0), "old_val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(0), "new_val")); + ASSERT_OK(Flush()); ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); @@ -3557,6 +3600,41 @@ TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { ASSERT_EQ("new_val", Get(Key(0))); } +TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + const Snapshot* snapshot = nullptr; + const int kMaxKey = 10; + + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + ASSERT_OK(Delete(Key(i))); + if (!snapshot) { + snapshot = db_->GetSnapshot(); + } + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey))); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // test DeleteFilesInRange() deletes the files already picked for compaction + SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::LogAndApply:WriteManifestStart", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Finish", + "VersionSet::LogAndApply:WriteManifestDone"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // release snapshot which mark bottommost file for compaction + db_->ReleaseSnapshot(snapshot); + std::string begin_string = Key(0); + std::string end_string = Key(kMaxKey + 1); + Slice begin(begin_string); + Slice end(end_string); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { // bottom-level files may contain deletions due to snapshots protecting the // deleted keys. 
Once the snapshot is released, we should see files with many @@ -3587,12 +3665,12 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { ASSERT_OK(Delete(Key(j))); } } - Flush(); + ASSERT_OK(Flush()); if (i < kNumLevelFiles - 1) { ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); std::vector pre_release_metadata, post_release_metadata; @@ -3613,7 +3691,7 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { CompactionReason::kBottommostFiles); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetLiveFilesMetaData(&post_release_metadata); ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); @@ -3662,12 +3740,12 @@ TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) { ASSERT_OK(Delete(Key(j))); } } - Flush(); + ASSERT_OK(Flush()); if (i < kNumLevelFiles - 1) { ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } } - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr)); ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); std::vector pre_release_metadata, post_release_metadata; @@ -3683,7 +3761,7 @@ TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) { [&](void* /*arg*/) { num_compactions.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); db_->ReleaseSnapshot(snapshot); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, num_compactions); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -3721,9 +3799,9 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); ASSERT_EQ("0,0,0,2", FilesPerLevel()); @@ -3732,9 +3810,9 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,0,0,2", FilesPerLevel()); MoveFilesToLevel(1); ASSERT_EQ("0,2,0,2", FilesPerLevel()); @@ -3745,14 +3823,14 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { // Just do a simple write + flush so that the Ttl expired files get // compacted. ASSERT_OK(Put("a", "1")); - Flush(); + ASSERT_OK(Flush()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { Compaction* compaction = reinterpret_cast(arg); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All non-L0 files are deleted, as they contained only deleted data. 
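// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the TTL mechanism the
// LevelCompactExpiredTtlFiles cases above exercise: files older than
// ColumnFamilyOptions::ttl are compacted even without write pressure, and the
// threshold can be tightened on a live DB, which is what triggers the kTtl
// compaction once the mocked clock has advanced. The function name is
// illustrative.
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status TightenTtl(rocksdb::DB* db) {
  // At open time the tests use the equivalent of:
  //   options.ttl = 24 * 60 * 60;  // compact files older than 24 hours
  // Dynamically lowering it to 10 hours, as the test does, makes the
  // already-12-hour-old files eligible immediately:
  return db->SetOptions({{"ttl", "36000"}});
}
// ---------------------------------------------------------------------------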
ASSERT_EQ("1", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -3768,9 +3846,9 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); ASSERT_EQ("0,0,0,2", FilesPerLevel()); @@ -3779,9 +3857,9 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,0,0,2", FilesPerLevel()); MoveFilesToLevel(1); ASSERT_EQ("0,2,0,2", FilesPerLevel()); @@ -3790,8 +3868,8 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { // trigger as ttl is set to 24 hours. env_->MockSleepForSeconds(12 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,2,0,2", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -3804,7 +3882,7 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { // Dynamically change ttl to 10 hours. // This should trigger a ttl compaction, as 12 hours have already passed. ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All non-L0 files are deleted, as they contained only deleted data. ASSERT_EQ("1", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -3864,7 +3942,7 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { for (int i = 1; i <= 100; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); // Get the first file's creation time. This will be the oldest file in the // DB. Compactions inolving this file's descendents should keep getting // this time. 
@@ -3877,7 +3955,7 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { for (int i = 101; i <= 200; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(6); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); @@ -3886,12 +3964,12 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { for (int i = 1; i <= 50; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); env_->MockSleepForSeconds(1 * 60 * 60); for (int i = 51; i <= 150; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(4); ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel()); @@ -3900,8 +3978,8 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { for (int i = 26; i <= 75; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(1); ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel()); @@ -3931,9 +4009,9 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ(5, ttl_compactions); @@ -3946,9 +4024,9 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); ASSERT_GE(ttl_compactions, 6); @@ -4013,9 +4091,9 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_EQ(0, periodic_compactions); @@ -4023,8 +4101,8 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { // Add 50 hours and do a write env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Assert that the files stay in the same level ASSERT_EQ("3", FilesPerLevel()); // The two old files go through the periodic compaction process @@ -4039,9 +4117,9 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,3", FilesPerLevel()); // The three old files now go through the periodic compaction process. 2 // + 3. @@ -4050,8 +4128,8 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { // Add another 50 hours and do another write env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("c", "3")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,3", FilesPerLevel()); // The four old files now go through the periodic compaction process. 5 // + 4. @@ -4108,7 +4186,7 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); // Move the first two files to L2. 
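// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the knob behind the
// LevelPeriodicCompaction tests above: ColumnFamilyOptions::
// periodic_compaction_seconds causes any file older than the threshold to be
// rewritten by a periodic compaction, regardless of its level or of write
// activity. The function name and the 48-hour value are illustrative.
#include "rocksdb/options.h"

rocksdb::Options MakePeriodicCompactionOptions() {
  rocksdb::Options options;
  // Rewrite every file at least once every 48 hours so that, for example,
  // compaction filters get a chance to run over cold data.
  options.periodic_compaction_seconds = 48 * 60 * 60;
  return options;
}
// ---------------------------------------------------------------------------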
if (i == 1) { MoveFilesToLevel(2); @@ -4172,9 +4250,9 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); @@ -4185,8 +4263,8 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { // Add some time greater than periodic_compaction_time. env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Files in the bottom level go through periodic compactions. ASSERT_EQ("1,0,0,2", FilesPerLevel()); ASSERT_EQ(2, periodic_compactions); @@ -4195,8 +4273,8 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { // Add a little more time than ttl env_->MockSleepForSeconds(11 * 60 * 60); ASSERT_OK(Put("b", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Notice that the previous file in level 1 falls down to the bottom level // due to ttl compactions, one level at a time. // And bottom level files don't get picked up for ttl compactions. @@ -4207,8 +4285,8 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { // Add some time greater than periodic_compaction_time. env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("c", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Previous L0 file falls one level at a time to bottom level due to ttl. // And all 4 bottom files go through periodic compactions. ASSERT_EQ("1,0,0,4", FilesPerLevel()); @@ -4284,9 +4362,9 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_EQ(0, periodic_compactions); @@ -4294,8 +4372,8 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { // Add 31 days and do a write env_->MockSleepForSeconds(31 * 24 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Assert that the files stay in the same level ASSERT_EQ("3", FilesPerLevel()); // The two old files go through the periodic compaction process @@ -4344,16 +4422,16 @@ TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) { for (int k = 0; k < 2; ++k) { ASSERT_OK(Put(Key(k), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); manual_compaction_thread.join(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4400,17 +4478,17 @@ TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) { FlushOptions flush_opts; flush_opts.wait = false; flush_opts.allow_write_stall = true; - dbfull()->Flush(flush_opts); + ASSERT_OK(dbfull()->Flush(flush_opts)); } auto manual_compaction_thread = port::Thread([this]() 
{ CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); manual_compaction_thread.join(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4446,12 +4524,11 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { for (int k = 0; k < 2; ++k) { ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024))); } - Flush(1); + ASSERT_OK(Flush(1)); } auto manual_compaction_thread = port::Thread([this, i]() { CompactRangeOptions cro; cro.allow_write_stall = false; - Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr); if (i == 0) { ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) .IsColumnFamilyDropped()); @@ -4471,7 +4548,7 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { manual_compaction_thread.join(); TEST_SYNC_POINT( "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } @@ -4506,25 +4583,26 @@ TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) { for (int j = 0; j < 2; ++j) { ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } - dbfull()->Flush(flush_opts); + ASSERT_OK(dbfull()->Flush(flush_opts)); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); - Put(ToString(0), rnd.RandomString(1024)); - dbfull()->Flush(flush_opts); - Put(ToString(0), rnd.RandomString(1024)); + ASSERT_OK(Put(ToString(0), rnd.RandomString(1024))); + ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(Put(ToString(0), rnd.RandomString(1024))); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); manual_compaction_thread.join(); // If CompactRange's flush was skipped, the final Put above will still be // in the active memtable. 
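// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the option the
// CompactRangeDelayedBy* and CompactRangeSkipFlushAfterDelay tests above hinge
// on: with CompactRangeOptions::allow_write_stall set to false, CompactRange()
// waits for L0 / immutable-memtable pressure to clear before doing its own
// flush and compaction, rather than adding to a write stall. The function name
// is illustrative.
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status PoliteFullCompaction(rocksdb::DB* db) {
  rocksdb::CompactRangeOptions cro;
  cro.allow_write_stall = false;  // defer rather than worsen a stall
  return db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
}
// ---------------------------------------------------------------------------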
std::string num_keys_in_memtable; - db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, &num_keys_in_memtable); + ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, + &num_keys_in_memtable)); ASSERT_EQ(ToString(1), num_keys_in_memtable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4582,7 +4660,7 @@ TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) { } else { ASSERT_EQ(2, num_memtable_entries); // flush anyways to prepare for next iteration - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } } } @@ -4597,12 +4675,12 @@ TEST_F(DBCompactionTest, CompactionStatsTest) { for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); ColumnFamilyData* cfd = cfh->cfd(); @@ -4687,7 +4765,7 @@ TEST_F(DBCompactionTest, CompactionHasEmptyOutput) { ASSERT_OK(Delete("b")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 0); @@ -4820,7 +4898,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { } for (unsigned int cf = 0; cf < cf_count; cf++) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } } @@ -4838,7 +4916,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { } // put extra key to trigger flush ASSERT_OK(Put(0, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 0)); } @@ -4853,7 +4931,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { } for (unsigned int cf = 0; cf < cf_count; cf++) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -4875,7 +4953,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { // put extra key to trigger flush ASSERT_OK(Put(cf_test, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test])); ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test)); Compact(cf_test, Key(0), Key(keyIndex)); @@ -4961,7 +5039,7 @@ TEST_P(CompactionPriTest, Test) { ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102))); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 0; i < kNKeys; i++) { ASSERT_NE("NOT_FOUND", Get(Key(i))); } @@ -5000,9 +5078,9 @@ TEST_F(DBCompactionTest, PartialManualCompaction) { Random rnd(301); for (auto i = 0; i < 8; ++i) { for (auto j = 0; j < 10; ++j) { - Merge("foo", rnd.RandomString(1024)); + ASSERT_OK(Merge("foo", rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } MoveFilesToLevel(2); @@ -5015,7 +5093,7 @@ TEST_F(DBCompactionTest, PartialManualCompaction) { CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); } TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { @@ -5023,7 +5101,7 @@ 
TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { // is in read-only mode. Verify it now at least returns, despite failing. const int kNumL0Files = 4; std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options opts = CurrentOptions(); opts.disable_auto_compactions = true; opts.env = mock_env.get(); @@ -5032,9 +5110,9 @@ TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { Random rnd(301); for (int i = 0; i < kNumL0Files; ++i) { // Make sure files are overlapping in key-range to prevent trivial move. - Put("key1", rnd.RandomString(1024)); - Put("key2", rnd.RandomString(1024)); - Flush(); + ASSERT_OK(Put("key1", rnd.RandomString(1024))); + ASSERT_OK(Put("key2", rnd.RandomString(1024))); + ASSERT_OK(Flush()); } ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0)); @@ -5083,7 +5161,7 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_OK( Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } MoveFilesToLevel(2); @@ -5093,7 +5171,7 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_OK( Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } const std::vector& comp_stats = internal_stats_ptr->TEST_GetCompactionStats(); @@ -5102,7 +5180,7 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); const std::vector& comp_stats2 = internal_stats_ptr->TEST_GetCompactionStats(); @@ -5110,6 +5188,94 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_EQ(num, 0); } +TEST_F(DBCompactionTest, ManualCompactionMax) { + uint64_t l1_avg_size = 0, l2_avg_size = 0; + auto generate_sst_func = [&]() { + Random rnd(301); + for (auto i = 0; i < 100; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + + uint64_t total = 0; + for (const auto& file : level_to_files[1]) { + total += file.compensated_file_size; + } + l1_avg_size = total / level_to_files[1].size(); + + total = 0; + for (const auto& file : level_to_files[2]) { + total += file.compensated_file_size; + } + l2_avg_size = total / level_to_files[2].size(); + }; + + std::atomic_int num_compactions(0); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + + // with default setting (1.6G by default), it should cover all files in 1 + // compaction + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == 1); + + // split the compaction to 5 + int num_split = 5; + DestroyAndReopen(opts); + generate_sst_func(); + uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + 
opts.max_compaction_bytes = total_size / num_split; + Reopen(opts); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); + + // very small max_compaction_bytes, it should still move forward + opts.max_compaction_bytes = l1_avg_size / 2; + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() > 10); + + // dynamically set the option + num_split = 2; + opts.max_compaction_bytes = 0; + DestroyAndReopen(opts); + generate_sst_func(); + total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + Status s = db_->SetOptions( + {{"max_compaction_bytes", std::to_string(total_size / num_split)}}); + ASSERT_OK(s); + + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); +} + TEST_F(DBCompactionTest, CompactionDuringShutdown) { Options opts = CurrentOptions(); opts.level0_file_num_compaction_trigger = 2; @@ -5127,14 +5293,15 @@ TEST_F(DBCompactionTest, CompactionDuringShutdown) { ASSERT_OK( Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsShutdownInProgress()); ASSERT_OK(dbfull()->error_handler_.GetBGError()); } @@ -5148,7 +5315,7 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { // Generate an external SST file containing a single key, i.e. 99 std::string sst_files_dir = dbname_ + "/sst_files/"; - DestroyDir(env_, sst_files_dir); + ASSERT_OK(DestroyDir(env_, sst_files_dir)); ASSERT_OK(env_->CreateDir(sst_files_dir)); SstFileWriter sst_writer(EnvOptions(), options); const std::string sst_file_path = sst_files_dir + "test.sst"; @@ -5187,7 +5354,7 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { // extra key to trigger flush. 
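// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the behavior the new
// ManualCompactionMax test above verifies: max_compaction_bytes also caps how
// much a manual CompactRange() pulls into one compaction job, so a full manual
// compaction splits into roughly total_size / max_compaction_bytes jobs. The
// knob can be set at open time or adjusted on a live DB; the function name is
// illustrative.
#include <cstdint>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status CapManualCompactionSize(rocksdb::DB* db, uint64_t bytes) {
  // Equivalent to setting options.max_compaction_bytes = bytes at open time.
  return db->SetOptions({{"max_compaction_bytes", std::to_string(bytes)}});
}
// ---------------------------------------------------------------------------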
ASSERT_OK(Put("", "")); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1); } // When we reach this point, there will be level0_stop_writes_trigger L0 @@ -5235,7 +5402,12 @@ TEST_F(DBCompactionTest, ConsistencyFailTest) { for (int k = 0; k < 2; ++k) { ASSERT_OK(Put("foo", "bar")); - Flush(); + Status s = Flush(); + if (k < 1) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsCorruption()); + } } ASSERT_NOK(Put("foo", "bar")); @@ -5250,7 +5422,7 @@ TEST_F(DBCompactionTest, ConsistencyFailTest2) { options.level0_file_num_compaction_trigger = 2; BlockBasedTableOptions bbto; bbto.block_size = 400; // small block size - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -5271,14 +5443,15 @@ TEST_F(DBCompactionTest, ConsistencyFailTest2) { ASSERT_OK(Put("foo1", value)); ASSERT_OK(Put("z", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("foo2", value)); ASSERT_OK(Put("z", "")); - Flush(); + Status s = Flush(); + ASSERT_TRUE(s.ok() || s.IsCorruption()); // This probably returns non-OK, but we rely on the next Put() // to determine the DB is frozen. - dbfull()->TEST_WaitForCompact(); + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); ASSERT_NOK(Put("foo", "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -5364,7 +5537,7 @@ TEST_P(DBCompactionTestWithParam, // Put one key, to make biggest log sequence number in this memtable is bigger // than sst which would be ingested in next step. ASSERT_OK(Put(Key(2), "b")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), @@ -5452,7 +5625,7 @@ TEST_P(DBCompactionTestWithParam, // Wake up flush job sleeping_tasks.WakeUp(); sleeping_tasks.WaitUntilDone(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); uint64_t error_count = 0; @@ -5478,7 +5651,7 @@ TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { for (int j = 1; j < UCHAR_MAX; j++) { auto key = std::string(kSstNum, '\0'); key[kSstNum - i] += static_cast(j); - Put(key, std::string(i % 1000, 'A')); + ASSERT_OK(Put(key, std::string(i % 1000, 'A'))); } ASSERT_OK(Flush()); } @@ -5488,7 +5661,7 @@ TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { auto cro = CompactRangeOptions(); cro.bottommost_level_compaction = bottommost_level_compaction_; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce || bottommost_level_compaction_ == BottommostLevelCompaction::kForceOptimized) { @@ -5527,12 +5700,12 @@ TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { // Trigger compaction for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(has_compaction); 
has_compaction = false; @@ -5550,12 +5723,12 @@ TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { // Trigger compaction for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(has_compaction); } @@ -5578,12 +5751,12 @@ TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { // Trigger compaction for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(has_compaction); has_compaction = false; @@ -5601,12 +5774,12 @@ TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { // Trigger compaction for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(has_compaction); } @@ -5654,7 +5827,7 @@ TEST_P(ChangeLevelConflictsWithAuto, TestConflict) { ASSERT_OK(Put("bar", "v3")); ASSERT_OK(Put("foo", "v3")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); TEST_SYNC_POINT("AutoCompactionFinished2"); }); @@ -5664,7 +5837,7 @@ TEST_P(ChangeLevelConflictsWithAuto, TestConflict) { cro.target_level = GetParam() ? 1 : 0; // This should return non-OK, but it's more important for the test to // make sure that the DB is not corrupted. - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); } auto_comp.join(); // Refitting didn't happen. 
@@ -5845,6 +6018,734 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { ASSERT_EQ("0,5", FilesPerLevel(0)); } +TEST_F(DBCompactionTest, CompactionWithBlob) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), third_value); + ASSERT_EQ(Get(second_key), third_value); + + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_EQ(l1_files.size(), 1); + + const FileMetaData* const table_file = l1_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.begin()->second; + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), first_key); + ASSERT_EQ(table_file->largest.user_key(), second_key); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 2); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1); +} + +class DBCompactionTestBlobError + : public DBCompactionTest, + public testing::WithParamInterface { + public: + DBCompactionTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBCompactionTestBlobError, CompactionError) { + Options options; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + 
ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_TRUE(l1_files.empty()); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_TRUE(blob_files.empty()); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 0); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } else { + // SST file writing succeeded; blob file writing failed (during Finish) + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } +} + +class DBCompactionTestBlobGC + : public DBCompactionTest, + public testing::WithParamInterface> { + public: + DBCompactionTestBlobGC() + : blob_gc_age_cutoff_(std::get<0>(GetParam())), + updated_enable_blob_files_(std::get<1>(GetParam())) {} + + double blob_gc_age_cutoff_; + bool updated_enable_blob_files_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC, + ::testing::Combine(::testing::Values(0.0, 0.5, 1.0), + ::testing::Bool())); + +TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.blob_file_size = 32; // one blob per file + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + 
constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 4); + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + // Note: turning off enable_blob_files before the compaction results in + // garbage collected values getting inlined. + size_t expected_number_of_files = original_blob_files.size(); + + if (!updated_enable_blob_files_) { + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + + expected_number_of_files -= cutoff_index; + } + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (blob_gc_age_cutoff_ > 0.0) { + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + + if (updated_enable_blob_files_) { + // GC relocated some blobs to new blob files + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, + compaction_stats[1].bytes_written_blob); + } else { + // GC moved some blobs back to the LSM, no new blob files + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } + } else { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_CorruptIndex) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char corrupt_blob_index[] = "foobar"; + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, + corrupt_blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + 
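// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the integrated BlobDB options the
// DBCompactionTestBlobGC cases above combine: values go to blob files when
// enable_blob_files is set, and compactions relocate ("garbage collect") blobs
// stored in the oldest blob_garbage_collection_age_cutoff fraction of blob
// files. The function name and the 0.5 cutoff are illustrative.
#include "rocksdb/options.h"

rocksdb::Options MakeBlobGcOptions() {
  rocksdb::Options options;
  options.enable_blob_files = true;
  options.blob_file_size = 32;  // tiny files here, i.e. one blob per file
  options.enable_blob_garbage_collection = true;
  // 0.5: compaction relocates blobs from the oldest half of the blob files;
  // if enable_blob_files is later switched off, those values are re-inlined
  // into the SSTs instead, as the test's expected file counts reflect.
  options.blob_garbage_collection_age_cutoff = 0.5;
  return options;
}
// ---------------------------------------------------------------------------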
constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. + std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + + // Fake a blob index referencing a non-existent blob file. 
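+  // For reference (sketch based on the arguments used below):
+  // BlobIndex::EncodeBlob packs (blob file number, offset, size, compression
+  // type) into the string that becomes the entry's value. Blob file number
+  // 1000 was never written, so resolving the reference during blob garbage
+  // collection is expected to fail with Status::Corruption.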
+ std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
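+  // Background on the handoff mechanism (summary; the exact plumbing is an
+  // assumption): with checksum_handoff_file_types containing kTableFile,
+  // RocksDB computes a checksum for each table-file write and passes it to
+  // the file system together with the data, and FaultInjectionTestFS checks
+  // the buffer against it using the type set via SetChecksumHandoffFuncType().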
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // options is not set, the checksum handoff will not be triggered + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
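+  // Note on severities (inferred from the assertions in these tests): a
+  // handoff failure on a table file is surfaced as kUnrecoverableError in the
+  // tests above, while the same kind of failure on the MANIFEST
+  // (kDescriptorFile) is asserted to map to kFatalError below.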
+ ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + #endif // !defined(ROCKSDB_LITE) } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_dynamic_level_test.cc b/db/db_dynamic_level_test.cc index 6ecf727c91e..6e2f6a283e7 100644 --- a/db/db_dynamic_level_test.cc +++ b/db/db_dynamic_level_test.cc @@ -13,6 +13,7 @@ #if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/port.h" #include "port/stack_trace.h" #include "util/random.h" @@ -102,7 +103,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) { } // Test compact range works - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // All data should be in the last level. 
ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(&cf_meta); @@ -141,6 +143,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { options.max_background_compactions = 2; options.num_levels = 5; options.max_compaction_bytes = 0; // Force not expanding in compactions + options.db_host_id = ""; // Setting this messes up the file size calculation BlockBasedTableOptions table_options; table_options.block_size = 1024; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -165,8 +168,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(4U, int_prop); @@ -183,8 +186,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop)); @@ -204,8 +207,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); @@ -233,8 +236,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { })); TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0"); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(2U, int_prop); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -263,7 +266,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { } TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2"); - Flush(); + ASSERT_OK(Flush()); thread.join(); @@ -301,7 +304,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) { DestroyAndReopen(options); // Compact against empty DB - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); uint64_t int_prop; std::string str_prop; @@ -315,13 +318,13 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) { ASSERT_OK( Put(Key(static_cast(rnd.Uniform(kMaxKey))), rnd.RandomString(80))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { // Make sure level 0 is not empty ASSERT_OK( Put(Key(static_cast(rnd.Uniform(kMaxKey))), rnd.RandomString(80))); - Flush(); + ASSERT_OK(Flush()); } ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); @@ -342,7 +345,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) { }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(output_levels.size(), 2); ASSERT_TRUE(output_levels.find(3) != output_levels.end()); ASSERT_TRUE(output_levels.find(4) != output_levels.end()); @@ -388,8 +391,8 @@ 
TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) { PutFixed32(&value, static_cast(i)); ASSERT_OK(Put(Key(i), value)); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(non_trivial, 0); @@ -448,7 +451,7 @@ TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) { ASSERT_OK(Delete(Key(i / 10))); } verify_func(total_keys, false); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); options.level_compaction_dynamic_level_bytes = true; options.disable_auto_compactions = true; @@ -463,7 +466,7 @@ TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = options.num_levels - 1; - dbfull()->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); compaction_finished.store(true); }); do { @@ -483,7 +486,7 @@ TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) { } verify_func(total_keys2, false); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); verify_func(total_keys2, false); // Base level is not level 1 diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index dddc8330713..c2ec3ec8a52 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -18,6 +18,13 @@ class DBEncryptionTest : public DBTestBase { public: DBEncryptionTest() : DBTestBase("/db_encryption_test", /*env_do_fsync=*/true) {} + Env* GetTargetEnv() { + if (encrypted_env_ != nullptr) { + return (static_cast(encrypted_env_))->target(); + } else { + return env_; + } + } }; #ifndef ROCKSDB_LITE @@ -34,20 +41,20 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { auto status = env_->GetChildren(dbname_, &fileNames); ASSERT_OK(status); - auto defaultEnv = Env::Default(); + Env* target = GetTargetEnv(); int hits = 0; for (auto it = fileNames.begin() ; it != fileNames.end(); ++it) { - if ((*it == "..") || (*it == ".")) { + if (*it == "LOCK") { continue; } auto filePath = dbname_ + "/" + *it; std::unique_ptr seqFile; auto envOptions = EnvOptions(CurrentOptions()); - status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); + status = target->NewSequentialFile(filePath, &seqFile, envOptions); ASSERT_OK(status); uint64_t fileSize; - status = defaultEnv->GetFileSize(filePath, &fileSize); + status = target->GetFileSize(filePath, &fileSize); ASSERT_OK(status); std::string scratch; @@ -85,7 +92,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { } TEST_F(DBEncryptionTest, ReadEmptyFile) { - auto defaultEnv = Env::Default(); + auto defaultEnv = GetTargetEnv(); // create empty file for reading it back in later auto envOptions = EnvOptions(CurrentOptions()); diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 35b8f648e04..fce28c02cc9 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -98,7 +98,14 @@ Status DBImpl::GetLiveFiles(std::vector& ret, ret.emplace_back(CurrentFileName("")); ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number())); - ret.emplace_back(OptionsFileName("", versions_->options_file_number())); + // The OPTIONS file number is zero in read-write mode when OPTIONS file + // writing failed and the DB was configured with + // `fail_if_options_file_error == false`. 
In read-only mode the OPTIONS file + // number is zero when no OPTIONS file exist at all. In those cases we do not + // record any OPTIONS file in the live file list. + if (versions_->options_file_number() != 0) { + ret.emplace_back(OptionsFileName("", versions_->options_file_number())); + } // find length of manifest file while holding the mutex lock *manifest_file_size = versions_->manifest_file_size(); diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 529acbc51dc..b5d3026d86e 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -11,13 +11,16 @@ #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "env/mock_env.h" #include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/utilities/transaction_db.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/mutexlock.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -63,7 +66,7 @@ TEST_F(DBFlushTest, FlushWhileWritingManifest) { ASSERT_OK(Put("bar", "v")); ASSERT_OK(dbfull()->Flush(no_wait)); // If the issue is hit we will wait here forever. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE ASSERT_EQ(2, TotalTableFiles()); #endif // ROCKSDB_LITE @@ -79,41 +82,26 @@ TEST_F(DBFlushTest, SyncFail) { options.env = fault_injection_env.get(); SyncPoint::GetInstance()->LoadDependency( - {{"DBFlushTest::SyncFail:GetVersionRefCount:1", - "DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"}, - {"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", - "DBFlushTest::SyncFail:GetVersionRefCount:2"}, - {"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, + {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}}); SyncPoint::GetInstance()->EnableProcessing(); CreateAndReopenWithCF({"pikachu"}, options); - Put("key", "value"); - auto* cfd = - static_cast_with_check(db_->DefaultColumnFamily()) - ->cfd(); + ASSERT_OK(Put("key", "value")); FlushOptions flush_options; flush_options.wait = false; ASSERT_OK(dbfull()->Flush(flush_options)); // Flush installs a new super-version. Get the ref count after that. - auto current_before = cfd->current(); - int refs_before = cfd->current()->TEST_refs(); - TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:1"); - TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:2"); - int refs_after_picking_memtables = cfd->current()->TEST_refs(); - ASSERT_EQ(refs_before + 1, refs_after_picking_memtables); fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBFlushTest::SyncFail:1"); TEST_SYNC_POINT("DBFlushTest::SyncFail:2"); fault_injection_env->SetFilesystemActive(true); // Now the background job will do the flush; wait for it. - dbfull()->TEST_WaitForFlushMemTable(); + // Returns the IO error happend during flush. + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE ASSERT_EQ("", FilesPerLevel()); // flush failed. #endif // ROCKSDB_LITE - // Backgroun flush job should release ref count to current version. 
- ASSERT_EQ(current_before, cfd->current()); - ASSERT_EQ(refs_before, cfd->current()->TEST_refs()); Destroy(options); } @@ -126,7 +114,7 @@ TEST_F(DBFlushTest, SyncSkip) { SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); - Put("key", "value"); + ASSERT_OK(Put("key", "value")); FlushOptions flush_options; flush_options.wait = false; @@ -136,7 +124,7 @@ TEST_F(DBFlushTest, SyncSkip) { TEST_SYNC_POINT("DBFlushTest::SyncSkip:2"); // Now the background job will do the flush; wait for it. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); Destroy(options); } @@ -171,13 +159,73 @@ TEST_F(DBFlushTest, FlushInLowPriThreadPool) { ASSERT_OK(Put("key", "val")); for (int i = 0; i < 4; ++i) { ASSERT_OK(Put("key", "val")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(4, num_flushes); ASSERT_EQ(1, num_compactions); } +// Test when flush job is submitted to low priority thread pool and when DB is +// closed in the meanwhile, CloseHelper doesn't hang. +TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) { + Options options = CurrentOptions(); + options.max_background_flushes = 1; + options.max_total_wal_size = 8192; + + DestroyAndReopen(options); + CreateColumnFamilies({"cf1", "cf2"}, options); + + env_->SetBackgroundThreads(0, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + int num_flushes = 0; + + SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush", + [&](void* /*arg*/) { ++num_flushes; }); + + int num_low_flush_unscheduled = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) { + num_low_flush_unscheduled++; + // There should be one flush job in low pool that needs to be + // unscheduled + ASSERT_EQ(num_low_flush_unscheduled, 1); + }); + + int num_high_flush_unscheduled = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) { + num_high_flush_unscheduled++; + // There should be no flush job in high pool + ASSERT_EQ(num_high_flush_unscheduled, 0); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(0, "key1", DummyString(8192))); + // Block thread so that flush cannot be run and can be removed from the queue + // when called Unschedule. + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + + // Trigger flush and flush job will be scheduled to LOW priority thread. + ASSERT_OK(Put(0, "key2", DummyString(8192))); + + // Close DB and flush job in low priority queue will be removed without + // running. + Close(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_EQ(0, num_flushes); + + TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options); + ASSERT_OK(Put(0, "key3", DummyString(8192))); + ASSERT_OK(Flush(0)); + ASSERT_EQ(1, num_flushes); +} + TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) { Options options = CurrentOptions(); options.write_buffer_size = 100; @@ -237,6 +285,379 @@ TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) { SyncPoint::GetInstance()->ClearAllCallBacks(); } +// The following 3 tests are designed for testing garbage statistics at flush +// time. +// +// ======= General Information ======= (from GitHub Wiki). 
+// There are three scenarios where memtable flush can be triggered: +// +// 1 - Memtable size exceeds ColumnFamilyOptions::write_buffer_size +// after a write. +// 2 - Total memtable size across all column families exceeds +// DBOptions::db_write_buffer_size, +// or DBOptions::write_buffer_manager signals a flush. In this scenario +// the largest memtable will be flushed. +// 3 - Total WAL file size exceeds DBOptions::max_total_wal_size. +// In this scenario the memtable with the oldest data will be flushed, +// in order to allow the WAL file with data from this memtable to be +// purged. +// +// As a result, a memtable can be flushed before it is full. This is one +// reason the generated SST file can be smaller than the corresponding +// memtable. Compression is another factor to make SST file smaller than +// corresponding memtable, since data in memtable is uncompressed. + +TEST_F(DBFlushTest, StatisticsGarbageBasic) { + Options options = CurrentOptions(); + + // The following options are used to enforce several values that + // may already exist as default values to make this test resilient + // to default value updates in the future. + options.statistics = CreateDBStatistics(); + + // Record all statistics. + options.statistics->set_stats_level(StatsLevel::kAll); + + // create the DB if it's not already present + options.create_if_missing = true; + + // Useful for now as we are trying to compare uncompressed data savings on + // flush(). + options.compression = kNoCompression; + + // Prevent memtable in place updates. Should already be disabled + // (from Wiki: + // In place updates can be enabled by toggling on the bool + // inplace_update_support flag. However, this flag is by default set to + // false + // because this thread-safe in-place update support is not compatible + // with concurrent memtable writes. Note that the bool + // allow_concurrent_memtable_write is set to true by default ) + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 64 << 20; + + ASSERT_OK(TryReopen(options)); + + // Put multiple times the same key-values. + // The encoded length of a db entry in the memtable is + // defined in db/memtable.cc (MemTable::Add) as the variable: + // encoded_len= VarintLength(internal_key_size) --> = + // log_256(internal_key). + // Min # of bytes + // necessary to + // store + // internal_key_size. + // + internal_key_size --> = actual key string, + // (size key_size: w/o term null char) + // + 8 bytes for + // fixed uint64 "seq + // number + // + + // insertion type" + // + VarintLength(val_size) --> = min # of bytes to + // store val_size + // + val_size --> = actual value + // string + // For example, in our situation, "key1" : size 4, "value1" : size 6 + // (the terminating null characters are not copied over to the memtable). + // And therefore encoded_len = 1 + (4+8) + 1 + 6 = 20 bytes per entry. + // However in terms of raw data contained in the memtable, and written + // over to the SSTable, we only count internal_key_size and val_size, + // because this is the only raw chunk of bytes that contains everything + // necessary to reconstruct a user entry: sequence number, insertion type, + // key, and value. 
+ + // To test the relevance of our Memtable garbage statistics, + // namely MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + // we insert K-V pairs with 3 distinct keys (of length 4), + // and random values of arbitrary length RAND_VALUES_LENGTH, + // and we repeat this step NUM_REPEAT times total. + // At the end, we insert 3 final K-V pairs with the same 3 keys + // and known values (these will be the final values, of length 6). + // I chose NUM_REPEAT=2,000 such that no automatic flush is + // triggered (the number of bytes in the memtable is therefore + // well below any meaningful heuristic for a memtable of size 64MB). + // As a result, since each K-V pair is inserted as a payload + // of N meaningful bytes (sequence number, insertion type, + // key, and value = 8 + 4 + RAND_VALUE_LENGTH), + // MEMTABLE_GARBAGE_BYTES_AT_FLUSH should be equal to 2,000 * N bytes + // and MEMTABLE_PAYLAOD_BYTES_AT_FLUSH = MEMTABLE_GARBAGE_BYTES_AT_FLUSH + + // (3*(8 + 4 + 6)) bytes. For RAND_VALUE_LENGTH = 172 (arbitrary value), we + // expect: + // N = 8 + 4 + 172 = 184 bytes + // MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 2,000 * 184 = 368,000 bytes. + // MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 368,000 + 3*18 = 368,054 bytes. + + const size_t NUM_REPEAT = 2000; + const size_t RAND_VALUES_LENGTH = 172; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string VALUE1 = "value1"; + const std::string VALUE2 = "value2"; + const std::string VALUE3 = "value3"; + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + ASSERT_OK(Put(KEY1, VALUE1)); + ASSERT_OK(Put(KEY2, VALUE2)); + ASSERT_OK(Put(KEY3, VALUE3)); + + // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY1.size() + VALUE1.size() + KEY2.size() + VALUE2.size() + KEY3.size() + + VALUE3.size() + 3 * sizeof(uint64_t); + + // We assert that the last K-V pairs have been successfully inserted, + // and that the valid values are VALUE1, VALUE2, VALUE3. + PinnableSlice value; + ASSERT_OK(Get(KEY1, &value)); + ASSERT_EQ(value.ToString(), VALUE1); + ASSERT_OK(Get(KEY2, &value)); + ASSERT_EQ(value.ToString(), VALUE2); + ASSERT_OK(Get(KEY3, &value)); + ASSERT_EQ(value.ToString(), VALUE3); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. 
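+  // For reference, the two tickers verified at the end of this test can be
+  // read from any Statistics object; in application code this would look
+  // roughly like:
+  //   uint64_t garbage =
+  //       options.statistics->getTickerCount(MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+  // The test below uses the TestGetTickerCount() helper for the same purpose.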
+  uint64_t mem_data_bytes =
+      TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  uint64_t mem_garbage_bytes =
+      TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) {
+  Options options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+  options.write_buffer_size = 67108864;
+
+  ASSERT_OK(TryReopen(options));
+
+  const size_t NUM_REPEAT = 2000;
+  const size_t RAND_VALUES_LENGTH = 37;
+  const std::string KEY1 = "key1";
+  const std::string KEY2 = "key2";
+  const std::string KEY3 = "key3";
+  const std::string KEY4 = "key4";
+  const std::string KEY5 = "key5";
+  const std::string KEY6 = "key6";
+
+  uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+  uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+  WriteBatch batch;
+
+  Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + p_v1.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY2.size() + p_v2.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY3.size() + p_v3.size() + sizeof(uint64_t);
+    ASSERT_OK(Delete(KEY1));
+    ASSERT_OK(Delete(KEY2));
+    ASSERT_OK(Delete(KEY3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+  }
+
+  // The memtable data bytes include the "garbage"
+  // bytes along with the useful payload.
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+      EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note: one set of deletes for KEY1, KEY2, KEY3 is written to the
+  // SSTable to propagate the delete operations to K-V pairs
+  // that could have been inserted into the database during past Flush
+  // operations.
+  EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+      KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+
+  // Additional useful payload.
+  ASSERT_OK(Delete(KEY4));
+  ASSERT_OK(Delete(KEY5));
+  ASSERT_OK(Delete(KEY6));
+
+  // Add useful payload to the memtable data bytes:
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      KEY4.size() + KEY5.size() + KEY6.size() + 3 * sizeof(uint64_t);
+
+  // We assert that the K-V pairs have been successfully deleted.
+  PinnableSlice value;
+  ASSERT_NOK(Get(KEY1, &value));
+  ASSERT_NOK(Get(KEY2, &value));
+  ASSERT_NOK(Get(KEY3, &value));
+
+  // Force flush to SST. Increments the statistics counter.
+  ASSERT_OK(Flush());
+
+  // Collect statistics.
+ uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + +TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) { + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + options.write_buffer_size = 67108864; + + ASSERT_OK(TryReopen(options)); + + const size_t NUM_REPEAT = 1000; + const size_t RAND_VALUES_LENGTH = 42; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string KEY4 = "key4"; + const std::string KEY5 = "key5"; + const std::string KEY6 = "key6"; + const std::string VALUE3 = "value3"; + + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + // Also insert DeleteRange + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1, + KEY2)); + // Note: DeleteRange have an exclusive upper bound, e.g. here: [KEY2,KEY3) + // is deleted. + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2, + KEY3)); + // Delete ranges are stored as a regular K-V pair, with key=STARTKEY, + // value=ENDKEY. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + // Note : one set of deleteRange for (KEY1, KEY2) and (KEY2, KEY3) is written + // to SSTable to propagate the deleteRange operations to K-V pairs that could + // have been inserted into the database during past Flush opeartions. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -= + (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); + + // Overwrite KEY3 with known value (VALUE3) + // Note that during the whole time KEY3 has never been deleted + // by the RangeDeletes. + ASSERT_OK(Put(KEY3, VALUE3)); + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY3.size() + VALUE3.size() + sizeof(uint64_t); + + // Additional useful paylaod. 
+ ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY4, KEY5)); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY5, KEY6)); + + // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + (KEY4.size() + KEY5.size() + sizeof(uint64_t)) + + (KEY5.size() + KEY6.size() + sizeof(uint64_t)); + + // We assert that the K-V pairs have been successfully deleted. + PinnableSlice value; + ASSERT_NOK(Get(KEY1, &value)); + ASSERT_NOK(Get(KEY2, &value)); + // And that KEY3's value is correct. + ASSERT_OK(Get(KEY3, &value)); + ASSERT_EQ(value, VALUE3); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. + uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + TEST_P(DBFlushDirectIOTest, DirectIO) { Options options; options.create_if_missing = true; @@ -306,7 +727,8 @@ TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { // mode. fault_injection_env->SetFilesystemActive(false); ASSERT_OK(db_->ContinueBackgroundWork()); - dbfull()->TEST_WaitForFlushMemTable(); + // We ingested the error to env, so the returned status is not OK. + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE uint64_t num_bg_errors; ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors, @@ -451,6 +873,7 @@ TEST_F(DBFlushTest, FlushWithBlob) { options.enable_blob_files = true; options.min_blob_size = min_blob_size; options.disable_auto_compactions = true; + options.env = env_; Reopen(options); @@ -468,9 +891,7 @@ TEST_F(DBFlushTest, FlushWithBlob) { ASSERT_OK(Flush()); ASSERT_EQ(Get("key1"), short_value); - - // TODO: enable once Get support is implemented for blobs - // ASSERT_EQ(Get("key2"), long_value); + ASSERT_EQ(Get("key2"), long_value); VersionSet* const versions = dbfull()->TEST_GetVersionSet(); assert(versions); @@ -509,26 +930,225 @@ TEST_F(DBFlushTest, FlushWithBlob) { const InternalStats* const internal_stats = cfd->internal_stats(); assert(internal_stats); - const uint64_t expected_bytes = - table_file->fd.GetFileSize() + blob_file->GetTotalBlobBytes(); - const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); ASSERT_FALSE(compaction_stats.empty()); - ASSERT_EQ(compaction_stats[0].bytes_written, expected_bytes); - ASSERT_EQ(compaction_stats[0].num_output_files, 2); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); - ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], expected_bytes); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); #endif // ROCKSDB_LITE } +TEST_F(DBFlushTest, FlushWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr 
fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + + // The hash does not match, write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->IngestDataCorruptionBeforeWrite(); + }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + + // options is not set, the checksum handoff will not be triggered + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->IngestDataCorruptionBeforeWrite(); + }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + + // The hash does not match, write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + ASSERT_OK(Put("key3", "value3")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
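+  // (Assumption about the helper:) IngestDataCorruptionBeforeWrite() makes
+  // FaultInjectionTestFS tamper with the buffer before persisting it, so the
+  // checksum handed off with the MANIFEST write no longer matches and the
+  // file system reports IOStatus::Corruption, which is expected to surface
+  // as kFatalError below.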
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + class DBFlushTestBlobError : public DBFlushTest, public testing::WithParamInterface { public: - DBFlushTestBlobError() : fault_injection_env_(env_) {} - ~DBFlushTestBlobError() { Close(); } + DBFlushTestBlobError() : sync_point_(GetParam()) {} - FaultInjectionTestEnv fault_injection_env_; + std::string sync_point_; }; INSTANTIATE_TEST_CASE_P(DBFlushTestBlobError, DBFlushTestBlobError, @@ -540,19 +1160,18 @@ TEST_P(DBFlushTestBlobError, FlushError) { Options options; options.enable_blob_files = true; options.disable_auto_compactions = true; - options.env = &fault_injection_env_; + options.env = env_; Reopen(options); ASSERT_OK(Put("key", "blob")); - SyncPoint::GetInstance()->SetCallBack(GetParam(), [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(false, Status::IOError()); + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); }); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(true); - }); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_NOK(Flush()); @@ -599,14 +1218,117 @@ TEST_P(DBFlushTestBlobError, FlushError) { const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); ASSERT_FALSE(compaction_stats.empty()); - ASSERT_EQ(compaction_stats[0].bytes_written, 0); - ASSERT_EQ(compaction_stats[0].num_output_files, 0); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[0].bytes_written, 0); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[0].num_output_files, 0); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0); + } else { + // SST file writing succeeded; blob file writing failed (during Finish) + ASSERT_GT(compaction_stats[0].bytes_written, 0); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0); + } const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); - ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], 0); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); #endif // ROCKSDB_LITE } +#ifndef ROCKSDB_LITE +TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.allow_2pc = true; + options.atomic_flush = GetParam(); + // 64MB so that memtable flush won't be trigger by the small writes. + options.write_buffer_size = (static_cast(64) << 20); + + // Destroy the DB to recreate as a TransactionDB. + Close(); + Destroy(options, true); + + // Create a TransactionDB. 
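+  // Why a TransactionDB: allow_2pc only takes effect for two-phase
+  // transactions, and the Prepare()/Commit() calls below require the
+  // TransactionDB API (here with the WRITE_COMMITTED policy), so the
+  // fixture's plain DB is destroyed and recreated as a TransactionDB.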
+  TransactionDB* txn_db = nullptr;
+  TransactionDBOptions txn_db_opts;
+  txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+  ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+  ASSERT_NE(txn_db, nullptr);
+  db_ = txn_db;
+
+  // Create two more column families besides the default CF.
+  std::vector<std::string> cfs = {"puppy", "kitty"};
+  CreateColumnFamilies(cfs, options);
+  ASSERT_EQ(handles_.size(), 2);
+  ASSERT_EQ(handles_[0]->GetName(), cfs[0]);
+  ASSERT_EQ(handles_[1]->GetName(), cfs[1]);
+  const size_t kNumCfToFlush = options.atomic_flush ? 2 : 1;
+
+  WriteOptions wopts;
+  TransactionOptions txn_opts;
+  // txn1 only prepares, but does not commit.
+  // The WAL containing the prepared but uncommitted data must be kept.
+  Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  // txn2 both prepares and commits.
+  Transaction* txn2 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_NE(txn1, nullptr);
+  ASSERT_NE(txn2, nullptr);
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    ASSERT_OK(txn1->Put(handles_[i], "k1", "v1"));
+    ASSERT_OK(txn2->Put(handles_[i], "k2", "v2"));
+  }
+  // A txn must be named before prepare.
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn2->SetName("txn2"));
+  // Prepare writes to WAL, but not to memtable. (WriteCommitted)
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn2->Prepare());
+  // Commit writes to memtable.
+  ASSERT_OK(txn2->Commit());
+  delete txn1;
+  delete txn2;
+
+  // There is still data in the memtable that has not been flushed.
+  // But since the data is small enough to reside in the active memtable,
+  // there are no immutable memtables.
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+  }
+
+  // Atomically flush the memtables; the min log number with prepared data
+  // should be written to the MANIFEST.
+  std::vector<ColumnFamilyHandle*> cfs_to_flush(kNumCfToFlush);
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    cfs_to_flush[i] = handles_[i];
+  }
+  ASSERT_OK(txn_db->Flush(FlushOptions(), cfs_to_flush));
+
+  // There is no remaining data in the memtables after the flush.
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+    ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+  }
+
+  // The recovered min log number with prepared data should be non-zero.
+  // In 2PC mode, MinLogNumberToKeep returns the
+  // VersionSet::min_log_number_to_keep_2pc recovered from the MANIFEST; if it
+  // is 0, atomic flush did not write the min_log_number_to_keep to the
+  // MANIFEST.
+ cfs.push_back(kDefaultColumnFamilyName); + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + DBImpl* db_impl = reinterpret_cast(db_); + ASSERT_TRUE(db_impl->allow_2pc()); + ASSERT_NE(db_impl->MinLogNumberToKeep(), 0); +} +#endif // ROCKSDB_LITE + TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -621,18 +1343,84 @@ TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { for (size_t i = 0; i != num_cfs; ++i) { ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); } + + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); + } + std::vector cf_ids; for (size_t i = 0; i != num_cfs; ++i) { cf_ids.emplace_back(static_cast(i)); } ASSERT_OK(Flush(cf_ids)); + for (size_t i = 0; i != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); + ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush); ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); } } +TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + options.write_buffer_size = (static_cast(64) << 20); + CreateAndReopenWithCF({"pikachu"}, options); + + const size_t num_cfs = handles_.size(); + ASSERT_EQ(num_cfs, 2); + WriteOptions wopts; + for (size_t i = 0; i != num_cfs; ++i) { + ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); + } + + { + // Flush the default CF only. + std::vector cf_ids{0}; + ASSERT_OK(Flush(cf_ids)); + + autovector flushed_cfds; + autovector> flush_edits; + auto flushed_cfh = static_cast(handles_[0]); + flushed_cfds.push_back(flushed_cfh->cfd()); + flush_edits.push_back({}); + auto unflushed_cfh = static_cast(handles_[1]); + + ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->TEST_GetVersionSet(), + flushed_cfds, flush_edits), + unflushed_cfh->cfd()->GetLogNumber()); + } + + { + // Flush all CFs. + std::vector cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + cf_ids.emplace_back(static_cast(i)); + } + ASSERT_OK(Flush(cf_ids)); + uint64_t log_num_after_flush = dbfull()->TEST_GetCurrentLogNumber(); + + uint64_t min_log_number_to_keep = port::kMaxUint64; + autovector flushed_cfds; + autovector> flush_edits; + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + flushed_cfds.push_back(cfh->cfd()); + flush_edits.push_back({}); + min_log_number_to_keep = + std::min(min_log_number_to_keep, cfh->cfd()->GetLogNumber()); + } + ASSERT_EQ(min_log_number_to_keep, log_num_after_flush); + ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->TEST_GetVersionSet(), + flushed_cfds, flush_edits), + min_log_number_to_keep); + } +} + TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -713,7 +1501,8 @@ TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) { fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2"); for (auto* cfh : handles_) { - dbfull()->TEST_WaitForFlushMemTable(cfh); + // Returns the IO error happend during flush. 
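// Conceptual sketch of the invariant checked above; MinWalToKeepNon2PC is a
// hypothetical stand-alone helper, not the real VersionSet-based function. In
// non-2PC mode a WAL becomes obsolete once every column family has flushed the
// data it contains, so the floor is the minimum current log number across all
// CFs (a just-flushed CF contributes its post-flush log number).
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

uint64_t MinWalToKeepNon2PC(const std::vector<uint64_t>& cf_log_numbers) {
  uint64_t min_log = std::numeric_limits<uint64_t>::max();
  for (uint64_t log_number : cf_log_numbers) {
    min_log = std::min(min_log, log_number);
  }
  return min_log;
}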
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable(cfh)); } for (size_t i = 0; i != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); diff --git a/db/compacted_db_impl.cc b/db/db_impl/compacted_db_impl.cc similarity index 93% rename from db/compacted_db_impl.cc rename to db/db_impl/compacted_db_impl.cc index cd4f27b9e63..076ce818fd1 100644 --- a/db/compacted_db_impl.cc +++ b/db/db_impl/compacted_db_impl.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "db/compacted_db_impl.h" +#include "db/db_impl/compacted_db_impl.h" #include "db/db_impl/db_impl.h" #include "db/version_set.h" @@ -17,11 +17,13 @@ extern void MarkKeyMayExist(void* arg); extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, const Slice& v, bool hit_and_return); -CompactedDBImpl::CompactedDBImpl( - const DBOptions& options, const std::string& dbname) - : DBImpl(options, dbname), cfd_(nullptr), version_(nullptr), - user_comparator_(nullptr) { -} +CompactedDBImpl::CompactedDBImpl(const DBOptions& options, + const std::string& dbname) + : DBImpl(options, dbname, /*seq_per_batch*/ false, +/*batch_per_txn*/ true, + /*read_only*/ true), + cfd_(nullptr), + version_(nullptr), + user_comparator_(nullptr) {} CompactedDBImpl::~CompactedDBImpl() { } @@ -78,6 +80,7 @@ std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, nullptr, nullptr, nullptr, true, nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr); + assert(static_cast(idx) < statuses.size()); if (!s.ok() && !s.IsNotFound()) { statuses[idx] = s; } else { diff --git a/db/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h similarity index 96% rename from db/compacted_db_impl.h rename to db/db_impl/compacted_db_impl.h index 7099566fc81..4cf00785e2f 100644 --- a/db/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -18,7 +18,7 @@ class CompactedDBImpl : public DBImpl { CompactedDBImpl(const CompactedDBImpl&) = delete; void operator=(const CompactedDBImpl&) = delete; - virtual ~CompactedDBImpl(); + ~CompactedDBImpl() override; static Status Open(const Options& options, const std::string& dbname, DB** dbptr); @@ -82,6 +82,11 @@ class CompactedDBImpl : public DBImpl { ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported in compacted db mode."); } + + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DB::IngestExternalFile; virtual Status IngestExternalFile( ColumnFamilyHandle* /*column_family*/, diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 29b4ac10abd..4e64963ed68 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -53,7 +53,6 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" -#include "env/composite_env_wrapper.h" #include "file/file_util.h" #include "file/filename.h" #include "file/random_access_file_reader.h" @@ -83,6 +82,7 @@ #include "rocksdb/stats_history.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" @@ -94,7 +94,6 @@ #include "table/two_level_iterator.h" #include "test_util/sync_point.h" #include "util/autovector.h" -#include "util/build_version.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" @@ 
-147,27 +146,31 @@ void DumpSupportInfo(Logger* logger) { } // namespace DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, - const bool seq_per_batch, const bool batch_per_txn) + const bool seq_per_batch, const bool batch_per_txn, + bool read_only) : dbname_(dbname), own_info_log_(options.info_log == nullptr), - initial_db_options_(SanitizeOptions(dbname, options)), + initial_db_options_(SanitizeOptions(dbname, options, read_only)), env_(initial_db_options_.env), io_tracer_(std::make_shared()), immutable_db_options_(initial_db_options_), fs_(immutable_db_options_.fs, io_tracer_), mutable_db_options_(initial_db_options_), - stats_(immutable_db_options_.statistics.get()), - mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, + stats_(immutable_db_options_.stats), + mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, immutable_db_options_.use_adaptive_mutex), default_cf_handle_(nullptr), + error_handler_(this, immutable_db_options_, &mutex_), + event_logger_(immutable_db_options_.info_log.get()), max_total_in_memory_state_(0), file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite( file_options_, immutable_db_options_)), seq_per_batch_(seq_per_batch), batch_per_txn_(batch_per_txn), - db_lock_(nullptr), + next_job_id_(1), shutting_down_(false), + db_lock_(nullptr), manual_compaction_paused_(false), bg_cv_(&mutex_), logfile_number_(0), @@ -192,9 +195,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, bg_purge_scheduled_(0), disable_delete_obsolete_files_(0), pending_purge_obsolete_files_(0), - delete_obsolete_files_last_run_(env_->NowMicros()), + delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()), last_stats_dump_time_microsec_(0), - next_job_id_(1), has_unpersisted_data_(false), unable_to_release_oldest_log_(false), num_running_ingest_file_(0), @@ -202,7 +204,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, wal_manager_(immutable_db_options_, file_options_, io_tracer_, seq_per_batch), #endif // ROCKSDB_LITE - event_logger_(immutable_db_options_.info_log.get()), bg_work_paused_(0), bg_compaction_paused_(0), refitting_level_(false), @@ -231,8 +232,9 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, own_sfm_(options.sst_file_manager == nullptr), preserve_deletes_(options.preserve_deletes), closed_(false), - error_handler_(this, immutable_db_options_, &mutex_), - atomic_flush_install_cv_(&mutex_) { + atomic_flush_install_cv_(&mutex_), + blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_, + &error_handler_) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. 
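// The new read_only argument is threaded through to SanitizeOptions() by the
// read-only and compacted open paths. For context, a minimal sketch of the public
// entry point (path hypothetical); a read-only open may be served by
// CompactedDBImpl when the database is fully compacted, in which case write-path
// calls such as SyncWAL() report NotSupported.
#include "rocksdb/db.h"

void OpenReadOnlyExample() {
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DB::OpenForReadOnly(
      ROCKSDB_NAMESPACE::Options(), "/tmp/readonly_db_example", &db);
  if (s.ok()) {
    db->SyncWAL().PermitUncheckedError();  // NotSupported in compacted mode
    delete db;
  }
}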
assert(batch_per_txn_ || seq_per_batch_); @@ -249,16 +251,17 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; co.metadata_charge_policy = kDontChargeCacheMetadata; table_cache_ = NewLRUCache(co); + SetDbSessionId(); + assert(!db_session_id_.empty()); versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_, table_cache_.get(), write_buffer_manager_, &write_controller_, &block_cache_tracer_, - io_tracer_)); + io_tracer_, db_session_id_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); DumpRocksDBBuildVersion(immutable_db_options_.info_log.get()); - SetDbSessionId(); DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_); immutable_db_options_.Dump(immutable_db_options_.info_log.get()); mutable_db_options_.Dump(immutable_db_options_.info_log.get()); @@ -268,6 +271,10 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber() // is called by client and this seqnum is advanced. preserve_deletes_seqnum_.store(0); + + if (write_buffer_manager_) { + wbm_stall_.reset(new WBMStallInterface()); + } } Status DBImpl::Resume() { @@ -307,18 +314,21 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); WaitForBackgroundWork(); - Status bg_error = error_handler_.GetBGError(); Status s; if (shutdown_initiated_) { // Returning shutdown status to SFM during auto recovery will cause it // to abort the recovery and allow the shutdown to progress s = Status::ShutdownInProgress(); } - if (s.ok() && bg_error.severity() > Status::Severity::kHardError) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "DB resume requested but failed due to Fatal/Unrecoverable error"); - s = bg_error; + + if (s.ok()) { + Status bg_error = error_handler_.GetBGError(); + if (bg_error.severity() > Status::Severity::kHardError) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but failed due to Fatal/Unrecoverable error"); + s = bg_error; + } } // Make sure the IO Status stored in version set is set to OK. @@ -393,6 +403,11 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { FindObsoleteFiles(&job_context, true); if (s.ok()) { s = error_handler_.ClearBGError(); + } else { + // NOTE: this is needed to pass ASSERT_STATUS_CHECKED + // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test. + // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952 + error_handler_.GetRecoveryError().PermitUncheckedError(); } mutex_.Unlock(); @@ -409,6 +424,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { if (file_deletion_disabled) { // Always return ok s = EnableFileDeletions(/*force=*/true); + if (!s.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but could not enable file deletions [%s]", + s.ToString().c_str()); + } } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); } @@ -460,7 +481,9 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { autovector cfds; SelectColumnFamiliesForAtomicFlush(&cfds); mutex_.Unlock(); - AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + Status s = + AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? 
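// Aside on the PermitUncheckedError() calls introduced in this change: under
// ASSERT_STATUS_CHECKED builds a Status aborts if it is destroyed without being
// inspected, so results that are deliberately ignored are marked explicitly.
// Minimal sketch (SomeBestEffortCleanup is a hypothetical call whose failure is
// tolerable here):
Status cleanup_status = SomeBestEffortCleanup();
cleanup_status.PermitUncheckedError();  // acknowledge the result is intentionally ignored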
mutex_.Lock(); } else { for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -496,19 +519,20 @@ Status DBImpl::CloseHelper() { } mutex_.Unlock(); + // Below check is added as recovery_error_ is not checked and it causes crash + // in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when space limit is + // reached. + error_handler_.GetRecoveryError().PermitUncheckedError(); + // CancelAllBackgroundWork called with false means we just set the shutdown // marker. After this we do a variant of the waiting and unschedule work // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) CancelAllBackgroundWork(false); - int bottom_compactions_unscheduled = - env_->UnSchedule(this, Env::Priority::BOTTOM); - int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); - int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); - Status ret = Status::OK(); mutex_.Lock(); - bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled; - bg_compaction_scheduled_ -= compactions_unscheduled; - bg_flush_scheduled_ -= flushes_unscheduled; + env_->UnSchedule(this, Env::Priority::BOTTOM); + env_->UnSchedule(this, Env::Priority::LOW); + env_->UnSchedule(this, Env::Priority::HIGH); + Status ret = Status::OK(); // Wait for background work to finish while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || @@ -636,11 +660,15 @@ Status DBImpl::CloseHelper() { if (immutable_db_options_.info_log && own_info_log_) { Status s = immutable_db_options_.info_log->Close(); - if (!s.ok() && ret.ok()) { + if (!s.ok() && !s.IsNotSupported() && ret.ok()) { ret = s; } } + if (write_buffer_manager_ && wbm_stall_) { + write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get()); + } + if (ret.IsAborted()) { // Reserve IsAborted() error for those where users didn't release // certain resource and they can release them and come back and @@ -670,8 +698,8 @@ void DBImpl::MaybeIgnoreError(Status* s) const { } const Status DBImpl::CreateArchivalDirectory() { - if (immutable_db_options_.wal_ttl_seconds > 0 || - immutable_db_options_.wal_size_limit_mb > 0) { + if (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0) { std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir); return env_->CreateDirIfMissing(archivalPath); } @@ -679,7 +707,7 @@ const Status DBImpl::CreateArchivalDirectory() { } void DBImpl::PrintStatistics() { - auto dbstats = immutable_db_options_.statistics.get(); + auto dbstats = immutable_db_options_.stats; if (dbstats) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s", dbstats->ToString().c_str()); @@ -688,6 +716,18 @@ void DBImpl::PrintStatistics() { void DBImpl::StartPeriodicWorkScheduler() { #ifndef ROCKSDB_LITE + +#ifndef NDEBUG + // It only used by test to disable scheduler + bool disable_scheduler = false; + TEST_SYNC_POINT_CALLBACK( + "DBImpl::StartPeriodicWorkScheduler:DisableScheduler", + &disable_scheduler); + if (disable_scheduler) { + return; + } +#endif // !NDEBUG + { InstrumentedMutexLock l(&mutex_); periodic_work_scheduler_ = PeriodicWorkScheduler::Default(); @@ -725,9 +765,10 @@ void DBImpl::PersistStats() { return; } TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning"); - uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; + uint64_t now_seconds = + immutable_db_options_.clock->NowMicros() / kMicrosInSecond; - Statistics* statistics = immutable_db_options_.statistics.get(); + Statistics* statistics = immutable_db_options_.stats; if (!statistics) { return; } @@ 
-866,13 +907,6 @@ Status DBImpl::GetStatsHistory( void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:1"); #ifndef ROCKSDB_LITE - const DBPropertyInfo* cf_property_info = - GetPropertyInfo(DB::Properties::kCFStats); - assert(cf_property_info != nullptr); - const DBPropertyInfo* db_property_info = - GetPropertyInfo(DB::Properties::kDBStats); - assert(db_property_info != nullptr); - std::string stats; if (shutdown_initiated_) { return; @@ -880,18 +914,29 @@ void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning"); { InstrumentedMutexLock l(&mutex_); - default_cf_internal_stats_->GetStringProperty( - *db_property_info, DB::Properties::kDBStats, &stats); + const std::string* property = &DB::Properties::kDBStats; + const DBPropertyInfo* property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); + default_cf_internal_stats_->GetStringProperty(*property_info, *property, + &stats); + + property = &DB::Properties::kCFStatsNoFileHistogram; + property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats); + cfd->internal_stats()->GetStringProperty(*property_info, *property, + &stats); } } + + property = &DB::Properties::kCFFileHistogram; + property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFFileHistogram, &stats); + cfd->internal_stats()->GetStringProperty(*property_info, *property, + &stats); } } } @@ -1281,7 +1326,11 @@ Status DBImpl::SyncWAL() { TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); { InstrumentedMutexLock l(&mutex_); - MarkLogsSynced(current_log_number, need_log_dir_sync, status); + if (status.ok()) { + status = MarkLogsSynced(current_log_number, need_log_dir_sync); + } else { + MarkLogsNotSynced(current_log_number); + } } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -1307,27 +1356,54 @@ Status DBImpl::UnlockWAL() { return Status::OK(); } -void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, - const Status& status) { +Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) { mutex_.AssertHeld(); - if (synced_dir && logfile_number_ == up_to && status.ok()) { + if (synced_dir && logfile_number_ == up_to) { log_dir_synced_ = true; } + VersionEdit synced_wals; for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { - auto& log = *it; - assert(log.getting_synced); - if (status.ok() && logs_.size() > 1) { - logs_to_free_.push_back(log.ReleaseWriter()); + auto& wal = *it; + assert(wal.getting_synced); + if (logs_.size() > 1) { + if (immutable_db_options_.track_and_verify_wals_in_manifest && + wal.writer->file()->GetFileSize() > 0) { + synced_wals.AddWal(wal.number, + WalMetadata(wal.writer->file()->GetFileSize())); + } + logs_to_free_.push_back(wal.ReleaseWriter()); // To modify logs_ both mutex_ and log_write_mutex_ must be held InstrumentedMutexLock l(&log_write_mutex_); it = logs_.erase(it); } else { - log.getting_synced = false; + wal.getting_synced = false; ++it; } } - assert(!status.ok() || logs_.empty() || logs_[0].number > up_to || + assert(logs_.empty() || logs_[0].number > up_to || (logs_.size() == 1 && !logs_[0].getting_synced)); + + Status s; + if (synced_wals.IsWalAddition()) { 
+ // not empty, write to MANIFEST. + s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + } + log_sync_cv_.SignalAll(); + return s; +} + +void DBImpl::MarkLogsNotSynced(uint64_t up_to) { + mutex_.AssertHeld(); + for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to; + ++it) { + auto& wal = *it; + assert(wal.getting_synced); + wal.getting_synced = false; + } log_sync_cv_.SignalAll(); } @@ -1595,8 +1671,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, } #endif // NDEBUG - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_GET); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); auto cfh = static_cast_with_check( @@ -1660,7 +1736,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, } // If timestamp is used, we use read callback to ensure is returned // only if t <= read_opts.timestamp and s <= snapshot. - if (ts_sz > 0 && !get_impl_options.callback) { + if (ts_sz > 0) { + assert(!get_impl_options + .callback); // timestamp with callback is not supported read_cb.Refresh(snapshot); get_impl_options.callback = &read_cb; } @@ -1784,8 +1862,8 @@ std::vector DBImpl::MultiGet( const std::vector& column_family, const std::vector& keys, std::vector* values, std::vector* timestamps) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); #ifndef NDEBUG @@ -1802,6 +1880,16 @@ std::vector DBImpl::MultiGet( } #endif // NDEBUG + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(column_family, keys).PermitUncheckedError(); + } + } + SequenceNumber consistent_seqnum; std::unordered_map multiget_cf_data( @@ -1914,9 +2002,8 @@ std::vector DBImpl::MultiGet( break; } } - if (read_options.deadline.count() && - env_->NowMicros() > + immutable_db_options_.clock->NowMicros() > static_cast(read_options.deadline.count())) { break; } @@ -1925,8 +2012,8 @@ std::vector DBImpl::MultiGet( if (keys_read < num_keys) { // The only reason to break out of the loop is when the deadline is // exceeded - assert(env_->NowMicros() > - static_cast(read_options.deadline.count())); + assert(immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())); for (++keys_read; keys_read < num_keys; ++keys_read) { stat_list[keys_read] = Status::TimedOut(); } @@ -2114,6 +2201,16 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, } #endif // NDEBUG + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? 
+ tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError(); + } + } + autovector key_context; autovector sorted_keys; sorted_keys.resize(num_keys); @@ -2172,7 +2269,7 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, for (; cf_iter != multiget_cf_data.end(); ++cf_iter) { s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys, cf_iter->super_version, consistent_seqnum, - read_callback, nullptr); + read_callback); if (!s.ok()) { break; } @@ -2276,6 +2373,15 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const Slice* keys, PinnableSlice* values, std::string* timestamps, Status* statuses, const bool sorted_input) { + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError(); + } + } autovector key_context; autovector sorted_keys; sorted_keys.resize(num_keys); @@ -2336,15 +2442,16 @@ void DBImpl::MultiGetWithCallback( } GetWithTimestampReadCallback timestamp_read_callback(0); - ReadCallback* read_callback = nullptr; + ReadCallback* read_callback = callback; if (read_options.timestamp && read_options.timestamp->size() > 0) { + assert(!read_callback); // timestamp with callback is not supported timestamp_read_callback.Refresh(consistent_seqnum); read_callback = ×tamp_read_callback; } Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys, multiget_cf_data[0].super_version, consistent_seqnum, - read_callback, nullptr); + read_callback); assert(s.ok() || s.IsTimedOut() || s.IsAborted()); ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd, multiget_cf_data[0].super_version); @@ -2363,9 +2470,9 @@ Status DBImpl::MultiGetImpl( const ReadOptions& read_options, size_t start_key, size_t num_keys, autovector* sorted_keys, SuperVersion* super_version, SequenceNumber snapshot, - ReadCallback* callback, bool* is_blob_index) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); + ReadCallback* callback) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). 
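// Aside on the double-checked pattern used for tracer_ above: the unlocked test
// keeps the hot path cheap when tracing is off, and the re-test under the mutex
// guards against the tracer being torn down concurrently. A generic sketch with
// the standard library (illustrative only; the real code uses InstrumentedMutex
// and still carries a TODO about removing the lock from this path):
#include <mutex>

struct QueryTracer { void Record() {} };  // stand-in type for illustration

std::mutex trace_mutex;
QueryTracer* query_tracer = nullptr;

void MaybeTrace() {
  if (query_tracer != nullptr) {            // cheap unlocked check
    std::lock_guard<std::mutex> guard(trace_mutex);
    if (query_tracer != nullptr) {          // re-check under the lock before use
      query_tracer->Record();
    }
  }
}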
@@ -2376,7 +2483,7 @@ Status DBImpl::MultiGetImpl( uint64_t curr_value_size = 0; while (keys_left) { if (read_options.deadline.count() && - env_->NowMicros() > + immutable_db_options_.clock->NowMicros() > static_cast(read_options.deadline.count())) { s = Status::TimedOut(); break; @@ -2402,11 +2509,9 @@ Status DBImpl::MultiGetImpl( (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); if (!skip_memtable) { - super_version->mem->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->mem->MultiGet(read_options, &range, callback); if (!range.empty()) { - super_version->imm->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->imm->MultiGet(read_options, &range, callback); } if (!range.empty()) { lookup_current = true; @@ -2416,8 +2521,7 @@ Status DBImpl::MultiGetImpl( } if (lookup_current) { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->current->MultiGet(read_options, &range, callback); } curr_value_size = range.GetValueSize(); if (curr_value_size > read_options.value_size_soft_limit) { @@ -2771,7 +2875,7 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, /* allow_unprepared_value */ true); result = NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd); #endif @@ -2792,7 +2896,7 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback, - bool allow_blob, + bool expose_blob_index, bool allow_refresh) { SuperVersion* sv = cfd->GetReferencedSuperVersion(this); @@ -2857,9 +2961,9 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback, this, cfd, allow_blob, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback, this, cfd, expose_blob_index, read_options.snapshot != nullptr ? 
false : allow_refresh); InternalIterator* internal_iter = NewInternalIterator( @@ -2897,7 +3001,7 @@ Status DBImpl::NewIterators( /* allow_unprepared_value */ true); iterators->push_back(NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd)); } @@ -2932,7 +3036,8 @@ const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, bool lock) { int64_t unix_time = 0; - env_->GetCurrentTime(&unix_time).PermitUncheckedError(); // Ignore error + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error SnapshotImpl* s = new SnapshotImpl; if (lock) { @@ -3069,20 +3174,25 @@ const std::string& DBImpl::GetName() const { return dbname_; } Env* DBImpl::GetEnv() const { return env_; } FileSystem* DB::GetFileSystem() const { - static LegacyFileSystemWrapper fs_wrap(GetEnv()); - return &fs_wrap; + const auto& fs = GetEnv()->GetFileSystem(); + return fs.get(); } FileSystem* DBImpl::GetFileSystem() const { return immutable_db_options_.fs.get(); } +SystemClock* DBImpl::GetSystemClock() const { + return immutable_db_options_.clock; +} + #ifndef ROCKSDB_LITE -Status DBImpl::StartIOTrace(Env* env, const TraceOptions& trace_options, +Status DBImpl::StartIOTrace(const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { assert(trace_writer != nullptr); - return io_tracer_->StartIOTrace(env, trace_options, std::move(trace_writer)); + return io_tracer_->StartIOTrace(GetSystemClock(), trace_options, + std::move(trace_writer)); } Status DBImpl::EndIOTrace() { @@ -3201,7 +3311,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { assert(value != nullptr); - Statistics* statistics = immutable_db_options_.statistics.get(); + Statistics* statistics = immutable_db_options_.stats; if (!statistics) { return false; } @@ -3368,6 +3478,10 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, return Status::InvalidArgument("Invalid options"); } + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + Version* v; auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); @@ -3375,9 +3489,23 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, v = sv->current; for (int i = 0; i < n; i++) { + Slice start = range[i].start; + Slice limit = range[i].limit; + + // Add timestamp if needed + std::string start_with_ts, limit_with_ts; + if (ts_sz > 0) { + // Maximum timestamp means including all key with any timestamp + AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz); + // Append a maximum timestamp as the range limit is exclusive: + // [start, limit) + AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz); + start = start_with_ts; + limit = limit_with_ts; + } // Convert user_key into a corresponding internal key. 
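// Public-API sketch of the size-approximation path reworked above; when the
// column family uses a user-defined timestamp, the range limits are padded
// internally with a maximum timestamp so that [start, limit) covers every
// timestamped version of the keys.
#include "rocksdb/db.h"

void ApproximateRangeSize(ROCKSDB_NAMESPACE::DB* db) {
  using namespace ROCKSDB_NAMESPACE;
  Range ranges[1] = {Range("a", "m")};  // user keys; the limit is exclusive
  uint64_t sizes[1] = {0};
  SizeApproximationOptions size_opts;   // defaults: include files, skip memtables
  db->GetApproximateSizes(size_opts, db->DefaultColumnFamily(), ranges, 1, sizes)
      .PermitUncheckedError();
}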
- InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( @@ -3429,14 +3557,13 @@ Status DBImpl::DeleteFile(std::string name) { FileType type; WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || - (type != kTableFile && type != kLogFile)) { + (type != kTableFile && type != kWalFile)) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } - Status status; - if (type == kLogFile) { + if (type == kWalFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, @@ -3444,7 +3571,7 @@ Status DBImpl::DeleteFile(std::string name) { name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = wal_manager_.DeleteFile(name, number); + Status status = wal_manager_.DeleteFile(name, number); if (!status.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), @@ -3453,6 +3580,7 @@ Status DBImpl::DeleteFile(std::string name) { return status; } + Status status; int level; FileMetaData* metadata; ColumnFamilyData* cfd; @@ -3526,7 +3654,7 @@ Status DBImpl::DeleteFile(std::string name) { Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { - Status status; + Status status = Status::OK(); auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); VersionEdit edit; @@ -3581,11 +3709,13 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, deleted_files.insert(level_file); level_file->being_compacted = true; } + vstorage->ComputeCompactionScore(*cfd->ioptions(), + *cfd->GetLatestMutableCFOptions()); } } if (edit.GetDeletedFiles().empty()) { job_context.Clean(); - return Status::OK(); + return status; } input_version->Ref(); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), @@ -3861,7 +3991,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, std::string path_to_delete = dbname + "/" + fname; if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); - } else if (type == kTableFile || type == kLogFile) { + } else if (type == kTableFile || type == kWalFile || + type == kBlobFile) { del = DeleteDBFile(&soptions, path_to_delete, dbname, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); } else { @@ -3886,9 +4017,10 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (env->GetChildren(path, &filenames).ok()) { for (const auto& fname : filenames) { if (ParseFileName(fname, &number, &type) && - type == kTableFile) { // Lock file will be deleted at end - std::string table_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, table_path, dbname, + (type == kTableFile || + type == kBlobFile)) { // Lock file will be deleted at end + std::string file_path = path + "/" + fname; + Status del = DeleteDBFile(&soptions, file_path, dbname, /*force_bg=*/false, /*force_fg=*/false); if (!del.ok() && result.ok()) { result = del; @@ -3915,7 +4047,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (env->GetChildren(archivedir, 
&archiveFiles).ok()) { // Delete archival files. for (const auto& file : archiveFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { Status del = DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); @@ -3931,7 +4063,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, // Delete log files in the WAL dir if (wal_dir_exists) { for (const auto& file : walDirFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { Status del = DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), soptions.wal_dir, /*force_bg=*/false, @@ -4136,16 +4268,17 @@ void DBImpl::EraseThreadStatusDbInfo() const {} // // A global method that can dump out the build version void DumpRocksDBBuildVersion(Logger* log) { -#if !defined(IOS_CROSS_COMPILE) - // if we compile with Xcode, we don't run build_detect_version, so we don't - // generate util/build_version.cc - ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, - ROCKSDB_MINOR, ROCKSDB_PATCH); - ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha); - ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date); -#else - (void)log; // ignore "-Wunused-parameter" -#endif + ROCKS_LOG_HEADER(log, "RocksDB version: %s\n", + GetRocksVersionAsString().c_str()); + const auto& props = GetRocksBuildProperties(); + const auto& sha = props.find("rocksdb_build_git_sha"); + if (sha != props.end()) { + ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str()); + } + const auto date = props.find("rocksdb_build_date"); + if (date != props.end()) { + ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str()); + } } #ifndef ROCKSDB_LITE @@ -4310,7 +4443,7 @@ Status DBImpl::IngestExternalFiles( } } // Ingest multiple external SST files atomically. 
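// Minimal public-API sketch of single-file ingestion for context; the multi-CF
// variant below takes a vector of IngestExternalFileArg and applies all column
// families atomically. The SST path is hypothetical.
#include "rocksdb/db.h"

ROCKSDB_NAMESPACE::Status IngestOne(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::IngestExternalFileOptions ifo;
  ifo.move_files = false;  // copy the file into the DB instead of hard-linking it
  return db->IngestExternalFile({"/tmp/example.sst"}, ifo);
}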
- size_t num_cfs = args.size(); + const size_t num_cfs = args.size(); for (size_t i = 0; i != num_cfs; ++i) { if (args[i].external_files.empty()) { char err_msg[128] = {0}; @@ -4347,14 +4480,11 @@ Status DBImpl::IngestExternalFiles( std::vector ingestion_jobs; for (const auto& arg : args) { auto* cfd = static_cast(arg.column_family)->cfd(); - ingestion_jobs.emplace_back( - env_, versions_.get(), cfd, immutable_db_options_, file_options_, - &snapshots_, arg.options, &directories_, &event_logger_, io_tracer_); - } - std::vector> exec_results; - for (size_t i = 0; i != num_cfs; ++i) { - exec_results.emplace_back(false, Status::OK()); + ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_, + file_options_, &snapshots_, arg.options, + &directories_, &event_logger_, io_tracer_); } + // TODO(yanqin) maybe make jobs run in parallel uint64_t start_file_number = next_file_number; for (size_t i = 1; i != num_cfs; ++i) { @@ -4362,10 +4492,13 @@ Status DBImpl::IngestExternalFiles( auto* cfd = static_cast(args[i].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - exec_results[i].second = ingestion_jobs[i].Prepare( + Status es = ingestion_jobs[i].Prepare( args[i].external_files, args[i].files_checksums, args[i].files_checksum_func_names, start_file_number, super_version); - exec_results[i].first = true; + // capture first error only + if (!es.ok() && status.ok()) { + status = es; + } CleanupSuperVersion(super_version); } TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0"); @@ -4374,23 +4507,17 @@ Status DBImpl::IngestExternalFiles( auto* cfd = static_cast(args[0].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - exec_results[0].second = ingestion_jobs[0].Prepare( + Status es = ingestion_jobs[0].Prepare( args[0].external_files, args[0].files_checksums, args[0].files_checksum_func_names, next_file_number, super_version); - exec_results[0].first = true; - CleanupSuperVersion(super_version); - } - for (const auto& exec_result : exec_results) { - if (!exec_result.second.ok()) { - status = exec_result.second; - break; + if (!es.ok()) { + status = es; } + CleanupSuperVersion(super_version); } if (!status.ok()) { for (size_t i = 0; i != num_cfs; ++i) { - if (exec_results[i].first) { - ingestion_jobs[i].Cleanup(status); - } + ingestion_jobs[i].Cleanup(status); } InstrumentedMutexLock l(&mutex_); ReleaseFileNumberFromPendingOutputs(pending_output_elem); @@ -4560,8 +4687,7 @@ Status DBImpl::IngestExternalFiles( // TODO: distinguish between MANIFEST write and CURRENT renaming const IOStatus& io_s = versions_->io_status(); // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite); } // Resume writes to the DB @@ -4623,9 +4749,9 @@ Status DBImpl::CreateColumnFamilyWithImport( // Import sst files from metadata. 
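// The import job constructed below backs CreateColumnFamilyWithImport(). Sketch of
// the public flow, assuming metadata previously produced by
// Checkpoint::ExportColumnFamily(); the column family name is hypothetical.
#include "rocksdb/db.h"
#include "rocksdb/utilities/checkpoint.h"

ROCKSDB_NAMESPACE::Status ImportCF(
    ROCKSDB_NAMESPACE::DB* db,
    const ROCKSDB_NAMESPACE::ExportImportFilesMetaData& metadata,
    ROCKSDB_NAMESPACE::ColumnFamilyHandle** imported) {
  ROCKSDB_NAMESPACE::ImportColumnFamilyOptions import_opts;
  import_opts.move_files = false;  // copy the exported files rather than moving them
  return db->CreateColumnFamilyWithImport(
      ROCKSDB_NAMESPACE::ColumnFamilyOptions(), "imported_cf", import_opts,
      metadata, imported);
}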
auto cfh = static_cast_with_check(*handle); auto cfd = cfh->cfd(); - ImportColumnFamilyJob import_job(env_, versions_.get(), cfd, - immutable_db_options_, file_options_, - import_options, metadata.files, io_tracer_); + ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_, + file_options_, import_options, + metadata.files, io_tracer_); SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); VersionEdit dummy_edit; @@ -4782,22 +4908,42 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, j++) { const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j]; const auto& fd = fd_with_krange.fd; + const FileMetaData* fmeta = fd_with_krange.file_metadata; + assert(fmeta); std::string fname = TableFileName(cfd->ioptions()->cf_paths, fd.GetNumber(), fd.GetPathId()); if (use_file_checksum) { - const FileMetaData* fmeta = fd_with_krange.file_metadata; - assert(fmeta); - s = VerifySstFileChecksum(*fmeta, fname, read_options); + s = VerifyFullFileChecksum(fmeta->file_checksum, + fmeta->file_checksum_func_name, fname, + read_options); } else { s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_, read_options, fname); } } } + + if (s.ok() && use_file_checksum) { + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + assert(meta); + const std::string blob_file_name = BlobFileName( + cfd->ioptions()->cf_paths.front().path, blob_file_number); + s = VerifyFullFileChecksum(meta->GetChecksumValue(), + meta->GetChecksumMethod(), blob_file_name, + read_options); + if (!s.ok()) { + break; + } + } + } if (!s.ok()) { break; } } + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; { @@ -4822,29 +4968,31 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, return s; } -Status DBImpl::VerifySstFileChecksum(const FileMetaData& fmeta, - const std::string& fname, - const ReadOptions& read_options) { +Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fname, + const ReadOptions& read_options) { Status s; - if (fmeta.file_checksum == kUnknownFileChecksum) { + if (file_checksum_expected == kUnknownFileChecksum) { return s; } std::string file_checksum; std::string func_name; s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum( fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(), - fmeta.file_checksum_func_name, &file_checksum, &func_name, + func_name_expected, &file_checksum, &func_name, read_options.readahead_size, immutable_db_options_.allow_mmap_reads, - io_tracer_); + io_tracer_, immutable_db_options_.rate_limiter.get()); if (s.ok()) { - assert(fmeta.file_checksum_func_name == func_name); - if (file_checksum != fmeta.file_checksum) { + assert(func_name_expected == func_name); + if (file_checksum != file_checksum_expected) { std::ostringstream oss; oss << fname << " file checksum mismatch, "; - oss << "expecting " << Slice(fmeta.file_checksum).ToString(/*hex=*/true); + oss << "expecting " + << Slice(file_checksum_expected).ToString(/*hex=*/true); oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true); s = Status::Corruption(oss.str()); - TEST_SYNC_POINT_CALLBACK("DBImpl::VerifySstFileChecksum:mismatch", &s); + TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s); } } return s; @@ -4879,7 +5027,8 @@ void DBImpl::WaitForIngestFile() { Status 
DBImpl::StartTrace(const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { InstrumentedMutexLock lock(&trace_mutex_); - tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer))); + tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options, + std::move(trace_writer))); return Status::OK(); } @@ -4898,8 +5047,8 @@ Status DBImpl::EndTrace() { Status DBImpl::StartBlockCacheTrace( const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { - return block_cache_tracer_.StartTrace(env_, trace_options, - std::move(trace_writer)); + return block_cache_tracer_.StartTrace(immutable_db_options_.clock, + trace_options, std::move(trace_writer)); } Status DBImpl::EndBlockCacheTrace() { @@ -4907,24 +5056,27 @@ Status DBImpl::EndBlockCacheTrace() { return Status::OK(); } -Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { Status s; if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - s = tracer_->IteratorSeek(cf_id, key); + s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound); } } return s; } -Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, - const Slice& key) { +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { Status s; if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - s = tracer_->IteratorSeekForPrev(cf_id, key); + s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound); } } return s; diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 5010bb6f46a..ff46896baba 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -54,9 +54,6 @@ #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" -#include "trace_replay/block_cache_tracer.h" -#include "trace_replay/io_tracer.h" -#include "trace_replay/trace_replay.h" #include "util/autovector.h" #include "util/hash.h" #include "util/repeatable_thread.h" @@ -132,7 +129,8 @@ class Directories { class DBImpl : public DB { public: DBImpl(const DBOptions& options, const std::string& dbname, - const bool seq_per_batch = false, const bool batch_per_txn = true); + const bool seq_per_batch = false, const bool batch_per_txn = true, + bool read_only = false); // No copying allowed DBImpl(const DBImpl&) = delete; void operator=(const DBImpl&) = delete; @@ -437,7 +435,8 @@ class DBImpl : public DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) override; - Status VerifyFileChecksums(const ReadOptions& read_options); + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_options) override; using DB::VerifyChecksum; virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; @@ -455,9 +454,10 @@ class DBImpl : public DB { Status VerifyChecksumInternal(const ReadOptions& read_options, bool use_file_checksum); - Status VerifySstFileChecksum(const FileMetaData& fmeta, - const std::string& fpath, - const ReadOptions& read_options); + Status VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fpath, + const ReadOptions& read_options); using DB::StartTrace; virtual Status StartTrace( @@ -476,7 +476,7 @@ class DBImpl : public DB { Status EndBlockCacheTrace() override; using 
DB::StartIOTrace; - Status StartIOTrace(Env* env, const TraceOptions& options, + Status StartIOTrace(const TraceOptions& options, std::unique_ptr&& trace_writer) override; using DB::EndIOTrace; @@ -493,6 +493,7 @@ class DBImpl : public DB { #endif // ROCKSDB_LITE // ---- End of implementations of the DB interface ---- + SystemClock* GetSystemClock() const; struct GetImplOptions { ColumnFamilyHandle* column_family = nullptr; @@ -527,7 +528,7 @@ class DBImpl : public DB { ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback, - bool allow_blob = false, + bool expose_blob_index = false, bool allow_refresh = true); virtual SequenceNumber GetLastPublishedSequence() const { @@ -600,8 +601,11 @@ class DBImpl : public DB { bool* found_record_for_key, bool* is_blob_index = nullptr); - Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); - Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, const Slice upper_bound); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound); #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot @@ -948,7 +952,7 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family = nullptr, bool disallow_trivial_move = false); - void TEST_SwitchWAL(); + Status TEST_SwitchWAL(); bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } @@ -980,6 +984,9 @@ class DBImpl : public DB { // is only for the special test of CancelledCompactions Status TEST_WaitForCompact(bool waitUnscheduled = false); + // Get the background error status + Status TEST_GetBGError(); + // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. int64_t TEST_MaxNextLevelOverlappingBytes( @@ -1043,6 +1050,12 @@ class DBImpl : public DB { VersionSet* TEST_GetVersionSet() const { return versions_.get(); } + uint64_t TEST_GetCurrentLogNumber() const { + InstrumentedMutexLock l(mutex()); + assert(!logs_.empty()); + return logs_.back().number; + } + const std::unordered_set& TEST_GetFilesGrabbedForPurge() const { return files_grabbed_for_purge_; } @@ -1062,6 +1075,56 @@ class DBImpl : public DB { // flush LOG out of application buffer void FlushInfoLog(); + // Interface to block and signal the DB in case of stalling writes by + // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. + // When DB needs to be blocked or signalled by WriteBufferManager, + // state_ is changed accordingly. + class WBMStallInterface : public StallInterface { + public: + enum State { + BLOCKED = 0, + RUNNING, + }; + + WBMStallInterface() : state_cv_(&state_mutex_) { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + } + + void SetState(State state) { + MutexLock lock(&state_mutex_); + state_ = state; + } + + // Change the state_ to State::BLOCKED and wait until its state is + // changed by WriteBufferManager. When stall is cleared, Signal() is + // called to change the state and unblock the DB. + void Block() override { + MutexLock lock(&state_mutex_); + while (state_ == State::BLOCKED) { + TEST_SYNC_POINT("WBMStallInterface::BlockDB"); + state_cv_.Wait(); + } + } + + // Called from WriteBufferManager. This function changes the state_ + // to State::RUNNING indicating the stall is cleared and DB can proceed. 
+ void Signal() override { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + state_cv_.Signal(); + } + + private: + // Conditional variable and mutex to block and + // signal the DB during stalling process. + port::Mutex state_mutex_; + port::CondVar state_cv_; + // state represting whether DB is running or blocked because of stall by + // WriteBufferManager. + State state_; + }; + protected: const std::string dbname_; std::string db_id_; @@ -1094,6 +1157,14 @@ class DBImpl : public DB { ColumnFamilyHandleImpl* default_cf_handle_; InternalStats* default_cf_internal_stats_; + // table_cache_ provides its own synchronization + std::shared_ptr table_cache_; + + ErrorHandler error_handler_; + + // Unified interface for logging events + EventLogger event_logger_; + // only used for dynamically adjusting max_total_wal_size. it is a sum of // [write_buffer_size * max_write_buffer_number] over all column families uint64_t max_total_in_memory_state_; @@ -1124,12 +1195,27 @@ class DBImpl : public DB { // Default: true const bool batch_per_txn_; + // Each flush or compaction gets its own job id. this counter makes sure + // they're unique + std::atomic next_job_id_; + + std::atomic shutting_down_; + // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. // If need_mutex_lock = false, the method will lock DB mutex. // If need_enter_write_thread = false, the method will enter write thread. Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread); + Status CompactRangeInternal(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end); + + Status GetApproximateSizesInternal(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes); + // The following two functions can only be called when: // 1. WriteThread::Writer::EnterUnbatched() is used. // 2. db_mutex is NOT held @@ -1241,14 +1327,22 @@ class DBImpl : public DB { virtual bool OwnTablesAndLogs() const { return true; } + // Set DB identity file, and write DB ID to manifest if necessary. + Status SetDBId(bool read_only); + // REQUIRES: db mutex held when calling this function, but the db mutex can // be released and re-acquired. Db mutex will be held when the function // returns. - // After best-efforts recovery, there may be SST files in db/cf paths that are - // not referenced in the MANIFEST. We delete these SST files. In the + // After recovery, there may be SST files in db/cf paths that are + // not referenced in the MANIFEST (e.g. + // 1. It's best effort recovery; + // 2. The VersionEdits referencing the SST files are appended to + // MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are + // still not synced to MANIFEST during recovery.) + // We delete these SST files. In the // meantime, we find out the largest file number present in the paths, and // bump up the version set's next_file_number_ to be 1 + largest_file_number. 
- Status FinishBestEffortsRecovery(); + Status DeleteUnreferencedSstFiles(); // SetDbSessionId() should be called in the constuctor DBImpl() // to ensure that db_session_id_ gets updated every time the DB is opened @@ -1306,6 +1400,7 @@ class DBImpl : public DB { struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + LogFileNumberSize() {} void AddSize(uint64_t new_size) { size += new_size; } uint64_t number; uint64_t size = 0; @@ -1386,15 +1481,16 @@ class DBImpl : public DB { uint32_t output_path_id; Status status; bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted - bool exclusive; // current behavior of only one manual - bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress + bool in_progress; // compaction request being processed? + bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + std::atomic* canceled; // Compaction canceled by the user? }; struct PrepickedCompaction { // background compaction takes ownership of `compaction`. @@ -1411,6 +1507,7 @@ class DBImpl : public DB { DBImpl* db; // background compaction takes ownership of `prepicked_compaction`. PrepickedCompaction* prepicked_compaction; + Env::Priority compaction_pri_; }; // Initialize the built-in column family for persistent stats. Depending on @@ -1505,6 +1602,12 @@ class DBImpl : public DB { Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); + // Get the size of a log file and, if truncate is true, truncate the + // log file to its actual size, thereby freeing preallocated space. + // Return success even if truncate fails + Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log); + // Restore alive_log_files_ and total_log_size_ after recovery. // It needs to run only when there's no flush during recovery // (e.g. avoid_flush_during_recovery=true). May also trigger flush @@ -1515,6 +1618,10 @@ class DBImpl : public DB { // `num_bytes` going through. Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); + // Begin stalling of writes when memory usage increases beyond a certain + // threshold. + void WriteBufferManagerStallWrites(); + Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, WriteBatch* my_batch); @@ -1594,7 +1701,7 @@ class DBImpl : public DB { Status SwitchWAL(WriteContext* write_context); // REQUIRES: mutex locked and in write thread. 
- Status HandleWriteBufferFull(WriteContext* write_context); + Status HandleWriteBufferManagerFlush(WriteContext* write_context); // REQUIRES: mutex locked Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync, @@ -1728,7 +1835,9 @@ class DBImpl : public DB { std::unique_ptr* token, LogBuffer* log_buffer); // helper function to call after some of the logs_ were synced - void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); + Status MarkLogsSynced(uint64_t up_to, bool synced_dir); + // WALs with log number up to up_to are not synced successfully. + void MarkLogsNotSynced(uint64_t up_to); SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary, bool lock = true); @@ -1867,13 +1976,11 @@ class DBImpl : public DB { Status MultiGetImpl( const ReadOptions& read_options, size_t start_key, size_t num_keys, autovector* sorted_keys, - SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, - bool* is_blob_index); + SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback); Status DisableFileDeletionsWithLock(); - // table_cache_ provides its own synchronization - std::shared_ptr table_cache_; + Status IncreaseFullHistoryTsLow(ColumnFamilyData* cfd, std::string ts_low); // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; @@ -1888,8 +1995,6 @@ class DBImpl : public DB { // mutex_, the order should be first mutex_ and then log_write_mutex_. InstrumentedMutex log_write_mutex_; - std::atomic shutting_down_; - // If zero, manual compactions are allowed to proceed. If non-zero, manual // compactions may still be running, but will quickly fail with // `Status::Incomplete`. The value indicates how many threads have paused @@ -2098,10 +2203,6 @@ class DBImpl : public DB { // Number of threads intending to write to memtable std::atomic pending_memtable_writes_ = {}; - // Each flush or compaction gets its own job id. this counter makes sure - // they're unique - std::atomic next_job_id_; - // A flag indicating whether the current rocksdb database has any // data that is not yet persisted into either WAL or SST file. // Used when disableWAL is true. @@ -2130,9 +2231,6 @@ class DBImpl : public DB { WalManager wal_manager_; #endif // ROCKSDB_LITE - // Unified interface for logging events - EventLogger event_logger_; - // A value of > 0 temporarily disables scheduling of background work int bg_work_paused_; @@ -2200,8 +2298,6 @@ class DBImpl : public DB { // Flag to check whether Close() has been called on this DB bool closed_; - ErrorHandler error_handler_; - // Conditional variable to coordinate installation of atomic flush results. // With atomic flush, each bg thread installs the result of flushing multiple // column families, and different threads can flush different column @@ -2226,11 +2322,18 @@ class DBImpl : public DB { LogBuffer* log_buffer, PluggableCompactionResult* result); bool wal_in_db_path_; + + BlobFileCompletionCallback blob_callback_; + + // Pointer to WriteBufferManager stalling interface. 
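// The WBMStallInterface introduced above parks writer threads while the
// WriteBufferManager reports memory pressure and wakes them again via
// Signal(); the wbm_stall_ member declared just below holds RocksDB's
// implementation. The following standalone sketch shows only the generic
// block/signal pattern with the standard library -- class and member names
// here are illustrative, not RocksDB's.

#include <condition_variable>
#include <mutex>

// Hypothetical stand-in for the stall gate: writers call Block() and sleep
// until the manager calls Signal() once memory usage drops again.
class StallGate {
 public:
  void Block() {
    std::unique_lock<std::mutex> lock(mu_);
    state_ = State::kBlocked;
    cv_.wait(lock, [this] { return state_ == State::kRunning; });
  }

  void Signal() {
    {
      std::lock_guard<std::mutex> lock(mu_);
      state_ = State::kRunning;
    }
    cv_.notify_all();
  }

 private:
  enum class State { kRunning, kBlocked };
  std::mutex mu_;
  std::condition_variable cv_;
  State state_ = State::kRunning;
};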
+ std::unique_ptr wbm_stall_; }; -extern Options SanitizeOptions(const std::string& db, const Options& src); +extern Options SanitizeOptions(const std::string& db, const Options& src, + bool read_only = false); -extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); +extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src, + bool read_only = false); extern CompressionType GetCompressionFlush( const ImmutableCFOptions& ioptions, @@ -2242,11 +2345,27 @@ extern CompressionType GetCompressionFlush( // `memtables_to_flush`) will be flushed and thus will not depend on any WAL // file. // The function is only applicable to 2pc mode. -extern uint64_t PrecomputeMinLogNumberToKeep( +extern uint64_t PrecomputeMinLogNumberToKeep2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, - autovector edit_list, + const autovector& edit_list, const autovector& memtables_to_flush, LogsWithPrepTracker* prep_tracker); +// For atomic flush. +extern uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists, + const autovector*>& memtables_to_flush, + LogsWithPrepTracker* prep_tracker); + +// In non-2PC mode, WALs with log number < the returned number can be +// deleted after the cfd_to_flush column family is flushed successfully. +extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + const autovector& edit_list); +// For atomic flush. +extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists); // `cfd_to_flush` is the column family whose memtable will be flushed and thus // will not depend on any WAL file. nullptr means no memtable is being flushed. @@ -2254,6 +2373,10 @@ extern uint64_t PrecomputeMinLogNumberToKeep( extern uint64_t FindMinPrepLogReferencedByMemTable( VersionSet* vset, const ColumnFamilyData* cfd_to_flush, const autovector& memtables_to_flush); +// For atomic flush. +extern uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector*>& memtables_to_flush); // Fix user-supplied options to be reasonable template diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index cce44f8626f..75571e96e77 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -35,8 +35,10 @@ bool DBImpl::EnoughRoomForCompaction( // Pass the current bg_error_ to SFM so it can decide what checks to // perform. If this DB instance hasn't seen any error yet, the SFM can be // optimistic and not do disk space checks - enough_room = - sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError()); + Status bg_error = error_handler_.GetBGError(); + enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error); + bg_error.PermitUncheckedError(); // bg_error is just a copy of the Status + // from the error_handler_ if (enough_room) { *sfm_reserved_compact_space = true; } @@ -123,20 +125,17 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { // "number <= current_log_number - 1" is equivalent to // "number < current_log_number". 
- MarkLogsSynced(current_log_number - 1, true, io_s); + if (io_s.ok()) { + io_s = status_to_io_status(MarkLogsSynced(current_log_number - 1, true)); + } else { + MarkLogsNotSynced(current_log_number - 1); + } if (!io_s.ok()) { - if (total_log_size_ > 0) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); - } else { - // If the WAL is empty, we use different error reason - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); - } TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); return io_s; } } + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end"); return io_s; } @@ -155,27 +154,25 @@ Status DBImpl::FlushMemTableToOutputFile( FlushJob flush_job( dbname_, cfd, immutable_db_options_, mutable_cf_options, - nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(), - &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, job_context, log_buffer, directories_.GetDbDir(), - GetDataDir(cfd, 0U), + port::kMaxUint64 /* memtable_id */, file_options_for_compaction_, + versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, + earliest_write_conflict_snapshot, snapshot_checker, job_context, + log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, true /* sync_output_directory */, true /* write_manifest */, thread_pri, - io_tracer_, db_id_, db_session_id_); + io_tracer_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(), + &blob_callback_); FileMetaData file_meta; - TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); - flush_job.PickMemTable(); - TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables"); - #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); #endif // ROCKSDB_LITE Status s; - IOStatus io_s = IOStatus::OK(); + bool need_cancel = false; + IOStatus log_io_s = IOStatus::OK(); if (logfile_number_ > 0 && versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) { // If there are more than one column families, we need to make sure that @@ -184,16 +181,24 @@ Status DBImpl::FlushMemTableToOutputFile( // flushed SST may contain data from write batches whose updates to // other column families are missing. // SyncClosedLogs() may unlock and re-lock the db_mutex. - io_s = SyncClosedLogs(job_context); - s = io_s; - if (!io_s.ok() && !io_s.IsShutdownInProgress() && - !io_s.IsColumnFamilyDropped()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + log_io_s = SyncClosedLogs(job_context); + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && + !log_io_s.IsColumnFamilyDropped()) { + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); } } else { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); } + s = log_io_s; + + // If the log sync failed, we do not need to pick memtable. Otherwise, + // num_flush_not_started_ needs to be rollback. + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); + if (s.ok()) { + flush_job.PickMemTable(); + need_cancel = true; + } + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables"); // Within flush_job.Run, rocksdb may call event listener to notify // file creation and deletion. @@ -203,11 +208,16 @@ Status DBImpl::FlushMemTableToOutputFile( // is unlocked by the current thread. 
if (s.ok()) { s = flush_job.Run(&logs_with_prep_tracker_, &file_meta); - } else { + need_cancel = false; + } + + if (!s.ok() && need_cancel) { flush_job.Cancel(); } - if (io_s.ok()) { - io_s = flush_job.io_status(); + IOStatus io_s = IOStatus::OK(); + io_s = flush_job.io_status(); + if (s.ok()) { + s = io_s; } if (s.ok()) { @@ -243,30 +253,30 @@ Status DBImpl::FlushMemTableToOutputFile( if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { if (!io_s.ok() && !io_s.IsShutdownInProgress() && !io_s.IsColumnFamilyDropped()) { + assert(log_io_s.ok()); // Error while writing to MANIFEST. // In fact, versions_->io_status() can also be the result of renaming // CURRENT file. With current code, it's just difficult to tell. So just // be pessimistic and try write to a new MANIFEST. // TODO: distinguish between MANIFEST write and CURRENT renaming if (!versions_->io_status().ok()) { - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); - } else if (total_log_size_ > 0) { - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. + // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is + // needed. + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); } else { - // If the WAL is empty, we use different error reason - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); } } else { - Status new_bg_error = s; - // Should handle return error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + if (log_io_s.ok()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } } } else { // If we got here, then we decided not to care about the i_os status (either @@ -284,16 +294,17 @@ Status DBImpl::FlushMemTableToOutputFile( // Notify sst_file_manager that a new file was added std::string file_path = MakeTableFileName( cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber()); - sfm->OnAddFile(file_path); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. + sfm->OnAddFile(file_path).PermitUncheckedError(); if (sfm->IsMaxAllowedSpaceReached()) { Status new_bg_error = Status::SpaceLimit("Max allowed space was reached"); TEST_SYNC_POINT_CALLBACK( "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", &new_bg_error); - // Should handle this error? 
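// The error-handling branch in this function (and its atomic-flush twin later
// in the file) routes a failed flush to a background-error reason depending on
// whether the WAL sync already failed and whether the MANIFEST write failed.
// Below is a hedged, simplified summary of that decision as a standalone
// function; the enum and names are invented for illustration and are not
// RocksDB's API.

#include <cassert>

enum class BgErrorReason { kFlush, kFlushNoWAL, kManifestWriteNoWAL };

BgErrorReason ClassifyFlushFailure(bool wal_sync_ok, bool flush_io_ok,
                                   bool manifest_io_ok) {
  // WAL sync failures are reported earlier, when SyncClosedLogs() fails.
  assert(wal_sync_ok);
  if (!flush_io_ok) {
    // The WAL is intact, so a MANIFEST/CURRENT failure or an SST write
    // failure can be treated as a softer, WAL-independent error.
    return manifest_io_ok ? BgErrorReason::kFlushNoWAL
                          : BgErrorReason::kManifestWriteNoWAL;
  }
  // Non-IO failure of the flush job itself.
  return BgErrorReason::kFlush;
}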
- error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } } #endif // ROCKSDB_LITE @@ -309,30 +320,22 @@ Status DBImpl::FlushMemTablesToOutputFiles( return AtomicFlushMemTablesToOutputFiles( bg_flush_args, made_progress, job_context, log_buffer, thread_pri); } + assert(bg_flush_args.size() == 1); std::vector snapshot_seqs; SequenceNumber earliest_write_conflict_snapshot; SnapshotChecker* snapshot_checker; GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); - Status status; - for (auto& arg : bg_flush_args) { - ColumnFamilyData* cfd = arg.cfd_; - MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); - SuperVersionContext* superversion_context = arg.superversion_context_; - Status s = FlushMemTableToOutputFile( - cfd, mutable_cf_options, made_progress, job_context, - superversion_context, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, log_buffer, thread_pri); - if (!s.ok()) { - status = s; - if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { - // At this point, DB is not shutting down, nor is cfd dropped. - // Something is wrong, thus we break out of the loop. - break; - } - } - } - return status; + const auto& bg_flush_arg = bg_flush_args[0]; + ColumnFamilyData* cfd = bg_flush_arg.cfd_; + MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + SuperVersionContext* superversion_context = + bg_flush_arg.superversion_context_; + Status s = FlushMemTableToOutputFile( + cfd, mutable_cf_options, made_progress, job_context, superversion_context, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + log_buffer, thread_pri); + return s; } /* @@ -395,7 +398,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions()); const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back(); - const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_); + uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_; jobs.emplace_back(new FlushJob( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_, @@ -404,13 +407,13 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, - thread_pri, io_tracer_, db_id_, db_session_id_)); - jobs.back()->PickMemTable(); + thread_pri, io_tracer_, db_id_, db_session_id_, + cfd->GetFullHistoryTsLow())); } std::vector file_meta(num_cfs); Status s; - IOStatus io_s; + IOStatus log_io_s = IOStatus::OK(); assert(num_cfs == static_cast(jobs.size())); #ifndef ROCKSDB_LITE @@ -425,18 +428,36 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. 
- io_s = SyncClosedLogs(job_context); - s = io_s; + log_io_s = SyncClosedLogs(job_context); + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && + !log_io_s.IsColumnFamilyDropped()) { + if (total_log_size_ > 0) { + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); + } else { + // If the WAL is empty, we use different error reason + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL); + } + } } + s = log_io_s; // exec_status stores the execution status of flush_jobs as // autovector> exec_status; autovector io_status; + std::vector pick_status; for (int i = 0; i != num_cfs; ++i) { // Initially all jobs are not executed, with status OK. exec_status.emplace_back(false, Status::OK()); io_status.emplace_back(IOStatus::OK()); + pick_status.push_back(false); + } + + if (s.ok()) { + for (int i = 0; i != num_cfs; ++i) { + jobs[i]->PickMemTable(); + pick_status[i] = true; + } } if (s.ok()) { @@ -477,6 +498,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = error_status.ok() ? s : error_status; } + IOStatus io_s = IOStatus::OK(); if (io_s.ok()) { IOStatus io_error = IOStatus::OK(); for (int i = 0; i != static_cast(io_status.size()); i++) { @@ -512,12 +534,12 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. for (int i = 0; i != num_cfs; ++i) { - if (!exec_status[i].first) { + if (pick_status[i] && !exec_status[i].first) { jobs[i]->Cancel(); } } for (int i = 0; i != num_cfs; ++i) { - if (exec_status[i].first && exec_status[i].second.ok()) { + if (exec_status[i].second.ok() && exec_status[i].first) { auto& mems = jobs[i]->GetMemTables(); cfds[i]->imm()->RollbackMemtableFlush(mems, file_meta[i].fd.GetNumber()); @@ -581,7 +603,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, tmp_file_meta, + versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta, &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer); } @@ -627,7 +649,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); assert(all_mutable_cf_options.size() == static_cast(num_cfs)); - for (int i = 0; i != num_cfs; ++i) { + for (int i = 0; s.ok() && i != num_cfs; ++i) { if (cfds[i]->IsDropped()) { continue; } @@ -636,14 +658,16 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (sfm) { std::string file_path = MakeTableFileName( cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber()); - sfm->OnAddFile(file_path); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. + sfm->OnAddFile(file_path).PermitUncheckedError(); if (sfm->IsMaxAllowedSpaceReached() && error_handler_.GetBGError().ok()) { Status new_bg_error = Status::SpaceLimit("Max allowed space was reached"); - // Should Handle this error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(new_bg_error, + BackgroundErrorReason::kFlush); } } } @@ -654,30 +678,30 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // it is not because of CF drop. if (!s.ok() && !s.IsColumnFamilyDropped()) { if (!io_s.ok() && !io_s.IsColumnFamilyDropped()) { + assert(log_io_s.ok()); // Error while writing to MANIFEST. 
// In fact, versions_->io_status() can also be the result of renaming // CURRENT file. With current code, it's just difficult to tell. So just // be pessimistic and try write to a new MANIFEST. // TODO: distinguish between MANIFEST write and CURRENT renaming if (!versions_->io_status().ok()) { - // Should Handle this error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); - } else if (total_log_size_ > 0) { - // Should Handle this error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. + // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor + // is needed. + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); } else { - // If the WAL is empty, we use different error reason - // Should Handle this error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); } } else { - Status new_bg_error = s; - // Should Handle this error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + if (log_io_s.ok()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } } } @@ -777,7 +801,68 @@ void DBImpl::NotifyOnFlushCompleted( Status DBImpl::CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { + const Slice* begin_without_ts, + const Slice* end_without_ts) { + if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + if (options.canceled && options.canceled->load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0) { + return CompactRangeInternal(options, column_family, begin_without_ts, + end_without_ts); + } + + std::string begin_str; + std::string end_str; + + // CompactRange compact all keys: [begin, end] inclusively. Add maximum + // timestamp to include all `begin` keys, and add minimal timestamp to include + // all `end` keys. + if (begin_without_ts != nullptr) { + AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz); + } + if (end_without_ts != nullptr) { + AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz); + } + Slice begin(begin_str); + Slice end(end_str); + + Slice* begin_with_ts = begin_without_ts ? &begin : nullptr; + Slice* end_with_ts = end_without_ts ? 
&end : nullptr; + + return CompactRangeInternal(options, column_family, begin_with_ts, + end_with_ts); +} + +Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyData* cfd, + std::string ts_low) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + edit.SetFullHistoryTsLow(ts_low); + + InstrumentedMutexLock l(&mutex_); + std::string current_ts_low = cfd->GetFullHistoryTsLow(); + const Comparator* ucmp = cfd->user_comparator(); + if (!current_ts_low.empty() && + ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) { + return Status::InvalidArgument( + "Cannot decrease full_history_timestamp_low"); + } + + return versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_); +} + +Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); @@ -786,18 +871,36 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, } bool flush_needed = true; + + // Update full_history_ts_low if it's set + if (options.full_history_ts_low != nullptr && + !options.full_history_ts_low->empty()) { + std::string ts_low = options.full_history_ts_low->ToString(); + if (begin != nullptr || end != nullptr) { + return Status::InvalidArgument( + "Cannot specify compaction range with full_history_ts_low"); + } + Status s = IncreaseFullHistoryTsLow(cfd, ts_low); + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } + } + + Status s; if (begin != nullptr && end != nullptr) { // TODO(ajkr): We could also optimize away the flush in certain cases where // one/both sides of the interval are unbounded. But it requires more // changes to RangesOverlapWithMemtables. Range range(*begin, *end); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed); + s = cfd->RangesOverlapWithMemtables( + {range}, super_version, immutable_db_options_.allow_data_in_errors, + &flush_needed); CleanupSuperVersion(super_version); } - Status s; - if (flush_needed) { + if (s.ok() && flush_needed) { FlushOptions fo; fo.allow_write_stall = options.allow_write_stall; if (immutable_db_options_.atomic_flush) { @@ -993,7 +1096,7 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options, assert(cfd); Status s; - JobContext job_context(0, true); + JobContext job_context(next_job_id_.fetch_add(1), true); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); @@ -1144,16 +1247,18 @@ Status DBImpl::CompactFilesImpl( assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, + job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_, - &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, table_cache_, &event_logger_, + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, 
dbname_, &compaction_job_stats, Env::Priority::USER, io_tracer_, - &manual_compaction_paused_, db_id_, db_session_id_); + &manual_compaction_paused_, nullptr, db_id_, db_session_id_, + c->column_family_data()->GetFullHistoryTsLow()); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -1167,7 +1272,8 @@ Status DBImpl::CompactFilesImpl( mutex_.Unlock(); TEST_SYNC_POINT("CompactFilesImpl:0"); TEST_SYNC_POINT("CompactFilesImpl:1"); - compaction_job.Run(); + // Ignore the status here, as it will be checked in the Install down below... + compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("CompactFilesImpl:2"); TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); @@ -1217,18 +1323,16 @@ Status DBImpl::CompactFilesImpl( job_context->job_id, status.ToString().c_str()); IOStatus io_s = compaction_job.io_status(); if (!io_s.ok()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction); } else { - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); } } if (output_file_names != nullptr) { for (const auto& newf : c->edit()->GetNewFiles()) { (*output_file_names) - .push_back(TableFileName(c->immutable_cf_options()->cf_paths, + .push_back(TableFileName(c->immutable_options()->cf_paths, newf.second.fd.GetNumber(), newf.second.fd.GetPathId())); } @@ -1326,10 +1430,13 @@ void DBImpl::NotifyOnCompactionCompleted( if (shutting_down_.load(std::memory_order_acquire)) { return; } + // TODO: Should disabling manual compaction squash compaction completed + // notifications that aren't the result of a shutdown? 
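// Manual compactions in this patch can now be interrupted two ways: by
// DisableManualCompaction() (the manual_compaction_paused_ counter) or by the
// new per-request `canceled` atomic, both polled with memory_order_acquire.
// A minimal standalone sketch of that cooperative-cancellation pattern is
// below; the types and names are hypothetical.

#include <atomic>

// A long-running job periodically polls a shared flag that another thread can
// flip; acquire/release ordering makes the request visible to the worker.
struct CancelToken {
  std::atomic<bool> canceled{false};
};

// Returns true if the work ran to completion, false if it was canceled early
// (analogous to returning Status::Incomplete(kManualCompactionPaused)).
bool RunCancelableWork(const CancelToken& token, int steps) {
  for (int i = 0; i < steps; ++i) {
    if (token.canceled.load(std::memory_order_acquire)) {
      return false;
    }
    // ... do one unit of work ...
  }
  return true;
}

// Elsewhere, a controlling thread requests cancellation:
//   token.canceled.store(true, std::memory_order_release);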
if (c->is_manual_compaction() && manual_compaction_paused_.load(std::memory_order_acquire) > 0) { return; } + Version* current = cfd->current(); current->Ref(); // release lock while notifying events @@ -1366,8 +1473,6 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { SuperVersionContext sv_context(/* create_superversion */ true); - Status status; - InstrumentedMutexLock guard_lock(&mutex_); // only allow one thread refitting @@ -1431,8 +1536,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, - directories_.GetDbDir()); + Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, + &mutex_, directories_.GetDbDir()); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", @@ -1443,12 +1549,14 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } + sv_context.Clean(); + refitting_level_ = false; + + return status; } - sv_context.Clean(); refitting_level_ = false; - - return status; + return Status::OK(); } int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { @@ -1553,6 +1661,7 @@ Status DBImpl::RunManualCompaction( manual.incomplete = false; manual.exclusive = exclusive; manual.disallow_trivial_move = disallow_trivial_move; + manual.canceled = compact_range_options.canceled; // For universal compaction, we enforce every manual compaction to compact // all files. if (begin == nullptr || @@ -1644,6 +1753,7 @@ Status DBImpl::RunManualCompaction( } ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::LOW; ca->prepicked_compaction = new PrepickedCompaction; ca->prepicked_compaction->manual_compaction_state = &manual; ca->prepicked_compaction->compaction = compaction; @@ -1690,8 +1800,9 @@ void DBImpl::GenerateFlushRequest(const autovector& cfds, Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& flush_options, FlushReason flush_reason, bool writes_stopped) { + // This method should not be called if atomic_flush is true. + assert(!immutable_db_options_.atomic_flush); Status s; - uint64_t flush_memtable_id = 0; if (!flush_options.allow_write_stall) { bool flush_needed = true; s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); @@ -1701,7 +1812,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } } - FlushRequest flush_req; + autovector flush_reqs; + autovector memtable_ids_to_wait; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); @@ -1716,18 +1828,24 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } WaitForPendingWrites(); - if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { - if (flush_reason != FlushReason::kErrorRecoveryRetryFlush) { - s = SwitchMemtable(cfd, &context); - } else { - assert(cfd->imm()->NumNotFlushed() > 0); - } + if (flush_reason != FlushReason::kErrorRecoveryRetryFlush && + (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) { + // Note that, when flush reason is kErrorRecoveryRetryFlush, during the + // auto retry resume, we want to avoid creating new small memtables. + // Therefore, SwitchMemtable will not be called. 
Also, since ResumeImpl + // will iterate through all the CFs and call FlushMemtable during auto + // retry resume, it is possible that in some CFs, + // cfd->imm()->NumNotFlushed() = 0. In this case, so no flush request will + // be created and scheduled, status::OK() will be returned. + s = SwitchMemtable(cfd, &context); } + const uint64_t flush_memtable_id = port::kMaxUint64; if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { - flush_memtable_id = cfd->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd, flush_memtable_id); + FlushRequest req{{cfd, flush_memtable_id}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID()); } if (immutable_db_options_.persist_stats_to_disk && flush_reason != FlushReason::kErrorRecoveryRetryFlush) { @@ -1753,15 +1871,19 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, "to avoid holding old logs", cfd->GetName().c_str()); s = SwitchMemtable(cfd_stats, &context); - flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd_stats, flush_memtable_id); + FlushRequest req{{cfd_stats, flush_memtable_id}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back( + cfd->imm()->GetLatestMemTableID()); } } } } - if (s.ok() && !flush_req.empty()) { - for (auto& elem : flush_req) { - ColumnFamilyData* loop_cfd = elem.first; + + if (s.ok() && !flush_reqs.empty()) { + for (const auto& req : flush_reqs) { + assert(req.size() == 1); + ColumnFamilyData* loop_cfd = req[0].first; loop_cfd->imm()->FlushRequested(); } // If the caller wants to wait for this flush to complete, it indicates @@ -1769,12 +1891,15 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, // other threads which may drop the column family concurrently. // Therefore, we increase the cfd's ref count. if (flush_options.wait) { - for (auto& elem : flush_req) { - ColumnFamilyData* loop_cfd = elem.first; + for (const auto& req : flush_reqs) { + assert(req.size() == 1); + ColumnFamilyData* loop_cfd = req[0].first; loop_cfd->Ref(); } } - SchedulePendingFlush(flush_req, flush_reason); + for (const auto& req : flush_reqs) { + SchedulePendingFlush(req, flush_reason); + } MaybeScheduleFlushOrCompaction(); } @@ -1790,9 +1915,11 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, if (s.ok() && flush_options.wait) { autovector cfds; autovector flush_memtable_ids; - for (auto& iter : flush_req) { - cfds.push_back(iter.first); - flush_memtable_ids.push_back(&(iter.second)); + assert(flush_reqs.size() == memtable_ids_to_wait.size()); + for (size_t i = 0; i < flush_reqs.size(); ++i) { + assert(flush_reqs[i].size() == 1); + cfds.push_back(flush_reqs[i][0].first); + flush_memtable_ids.push_back(&(memtable_ids_to_wait[i])); } s = WaitForFlushMemTables( cfds, flush_memtable_ids, @@ -1974,12 +2101,12 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, // check whether one extra immutable memtable or an extra L0 file would // cause write stalling mode to be entered. 
It could still enter stall // mode due to pending compaction bytes, but that's less common - write_stall_condition = - ColumnFamilyData::GetWriteStallConditionAndCause( - cfd->imm()->NumNotFlushed() + 1, - vstorage->l0_delay_trigger_count() + 1, - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) - .first; + write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), + mutable_cf_options, *cfd->ioptions()) + .first; } while (write_stall_condition != WriteStallCondition::kNormal); } return Status::OK(); @@ -2158,6 +2285,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { unscheduled_compactions_ > 0) { CompactionArg* ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::LOW; ca->prepicked_compaction = nullptr; bg_compaction_scheduled_++; unscheduled_compactions_--; @@ -2217,6 +2345,17 @@ DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { assert(!flush_queue_.empty()); FlushRequest flush_req = flush_queue_.front(); flush_queue_.pop_front(); + if (!immutable_db_options_.atomic_flush) { + assert(flush_req.size() == 1); + } + for (const auto& elem : flush_req) { + if (!immutable_db_options_.atomic_flush) { + ColumnFamilyData* cfd = elem.first; + assert(cfd); + assert(cfd->queued_for_flush()); + cfd->set_queued_for_flush(false); + } + } // TODO: need to unset flush reason? return flush_req; } @@ -2249,19 +2388,36 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue( void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req, FlushReason flush_reason) { + mutex_.AssertHeld(); if (flush_req.empty()) { return; } - for (auto& iter : flush_req) { - ColumnFamilyData* cfd = iter.first; - cfd->Ref(); - cfd->SetFlushReason(flush_reason); + if (!immutable_db_options_.atomic_flush) { + // For the non-atomic flush case, we never schedule multiple column + // families in the same flush request. 
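// For the non-atomic case described in the comment above, SchedulePendingFlush()
// now enqueues at most one single-CF request and uses the queued_for_flush flag
// to avoid queueing the same column family twice; PopFirstFromFlushQueue()
// clears the flag on dequeue. A standalone sketch of that dedup-queue idea,
// with generic types rather than RocksDB's:

#include <deque>

// Each schedulable item carries a "queued" flag so it is enqueued at most once.
struct FlushCandidate {
  bool queued_for_flush = false;
  bool flush_pending = true;  // stands in for imm()->IsFlushPending()
};

class FlushQueue {
 public:
  // Enqueue only if there is work to do and the item is not already queued.
  void Schedule(FlushCandidate* c) {
    if (!c->queued_for_flush && c->flush_pending) {
      c->queued_for_flush = true;
      queue_.push_back(c);
    }
  }

  // Dequeue and clear the flag so the item can be scheduled again later.
  FlushCandidate* PopFirst() {
    if (queue_.empty()) return nullptr;
    FlushCandidate* c = queue_.front();
    queue_.pop_front();
    c->queued_for_flush = false;
    return c;
  }

 private:
  std::deque<FlushCandidate*> queue_;
};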
+ assert(flush_req.size() == 1); + ColumnFamilyData* cfd = flush_req[0].first; + assert(cfd); + if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) { + cfd->Ref(); + cfd->set_queued_for_flush(true); + cfd->SetFlushReason(flush_reason); + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); + } + } else { + for (auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + cfd->Ref(); + cfd->SetFlushReason(flush_reason); + } + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); } - ++unscheduled_flushes_; - flush_queue_.push_back(flush_req); } void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { + mutex_.AssertHeld(); if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) { AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2317,7 +2473,16 @@ void DBImpl::BGWorkPurge(void* db) { } void DBImpl::UnscheduleCompactionCallback(void* arg) { - CompactionArg ca = *(reinterpret_cast(arg)); + CompactionArg* ca_ptr = reinterpret_cast(arg); + Env::Priority compaction_pri = ca_ptr->compaction_pri_; + if (Env::Priority::BOTTOM == compaction_pri) { + // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM + ca_ptr->db->bg_bottom_compaction_scheduled_--; + } else if (Env::Priority::LOW == compaction_pri) { + // Decrement bg_compaction_scheduled_ if priority is LOW + ca_ptr->db->bg_compaction_scheduled_--; + } + CompactionArg ca = *(ca_ptr); delete reinterpret_cast(arg); if (ca.prepicked_compaction != nullptr) { if (ca.prepicked_compaction->compaction != nullptr) { @@ -2329,6 +2494,14 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) { } void DBImpl::UnscheduleFlushCallback(void* arg) { + // Decrement bg_flush_scheduled_ in flush callback + reinterpret_cast(arg)->db_->bg_flush_scheduled_--; + Env::Priority flush_pri = reinterpret_cast(arg)->thread_pri_; + if (Env::Priority::LOW == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback"); + } else if (Env::Priority::HIGH == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback"); + } delete reinterpret_cast(arg); TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback"); } @@ -2421,6 +2594,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1"); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2"); { InstrumentedMutexLock l(&mutex_); assert(bg_flush_scheduled_); @@ -2449,7 +2624,7 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); LogFlush(immutable_db_options_.info_log); - env_->SleepForMicroseconds(1000000); + immutable_db_options_.clock->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2522,7 +2697,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, if (s.IsBusy()) { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - env_->SleepForMicroseconds(10000); // prevent hot loop + immutable_db_options_.clock->SleepForMicroseconds( + 10000); // prevent hot loop mutex_.Lock(); } else if (!s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { @@ -2540,7 +2716,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); LogFlush(immutable_db_options_.info_log); - env_->SleepForMicroseconds(1000000); + 
immutable_db_options_.clock->SleepForMicroseconds(1000000); mutex_.Lock(); } else if (s.IsManualCompactionPaused()) { ManualCompactionState* m = prepicked_compaction->manual_compaction_state; @@ -2556,7 +2732,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // failure). Thus, we force full scan in FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && - !s.IsColumnFamilyDropped()); + !s.IsColumnFamilyDropped() && + !s.IsBusy()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex @@ -2590,6 +2767,14 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // See if there's more work to be done MaybeScheduleFlushOrCompaction(); + + if (prepicked_compaction != nullptr && + prepicked_compaction->task_token != nullptr) { + // Releasing task tokens affects the DB state, so must be done before we + // potentially signal the DB close process to proceed below. + prepicked_compaction->task_token->ReleaseOnce(); + } + if (made_progress || (bg_compaction_scheduled_ == 0 && bg_bottom_compaction_scheduled_ == 0) || @@ -2642,6 +2827,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } else if (is_manual && manual_compaction_paused_.load(std::memory_order_acquire) > 0) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } else if (is_manual && manual_compaction->canceled && + manual_compaction->canceled->load(std::memory_order_acquire)) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } } else { status = error_handler_.GetBGError(); @@ -2769,7 +2957,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->column_family_data() ->current() ->storage_info() - ->ComputeCompactionScore(*(c->immutable_cf_options()), + ->ComputeCompactionScore(*(c->immutable_options()), *(c->mutable_cf_options())); AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2929,6 +3117,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); CompactionArg* ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::BOTTOM; ca->prepicked_compaction = new PrepickedCompaction; ca->prepicked_compaction->compaction = c.release(); ca->prepicked_compaction->manual_compaction_state = nullptr; @@ -2952,16 +3141,19 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, - file_options_for_compaction_, versions_.get(), &shutting_down_, - preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, - &mutex_, &error_handler_, snapshot_seqs, - earliest_write_conflict_snapshot, snapshot_checker, table_cache_, - &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + mutable_db_options_, file_options_for_compaction_, versions_.get(), + &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, + directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, + &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, table_cache_, &event_logger_, + c->mutable_cf_options()->paranoid_file_checks, 
c->mutable_cf_options()->report_bg_io_stats, dbname_, &compaction_job_stats, thread_pri, io_tracer_, - is_manual ? &manual_compaction_paused_ : nullptr, db_id_, - db_session_id_); + is_manual ? &manual_compaction_paused_ : nullptr, + is_manual ? manual_compaction->canceled : nullptr, db_id_, + db_session_id_, c->column_family_data()->GetFullHistoryTsLow()); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, @@ -3036,10 +3228,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, auto err_reason = versions_->io_status().ok() ? BackgroundErrorReason::kCompaction : BackgroundErrorReason::kManifestWrite; - error_handler_.SetBGError(io_s, err_reason).PermitUncheckedError(); + error_handler_.SetBGError(io_s, err_reason); } else { - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); } if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { // Put this cfd back in the compaction queue so we can retry after some @@ -3051,7 +3242,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->column_family_data() ->current() ->storage_info() - ->ComputeCompactionScore(*(c->immutable_cf_options()), + ->ComputeCompactionScore(*(c->immutable_options()), *(c->mutable_cf_options())); if (!cfd->queued_for_compaction()) { AddToCompactionQueue(cfd); @@ -3192,7 +3383,7 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { if (m->cfd != m1->cfd) { return false; } - return true; + return false; } #ifndef ROCKSDB_LITE @@ -3216,7 +3407,7 @@ void DBImpl::BuildCompactionJobInfo( for (const auto fmd : *c->inputs(i)) { const FileDescriptor& desc = fmd->fd; const uint64_t file_number = desc.GetNumber(); - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number, + auto fn = TableFileName(c->immutable_options()->cf_paths, file_number, desc.GetPathId()); compaction_job_info->input_files.push_back(fn); compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ @@ -3235,7 +3426,7 @@ void DBImpl::BuildCompactionJobInfo( const FileDescriptor& desc = meta.fd; const uint64_t file_number = desc.GetNumber(); compaction_job_info->output_files.push_back(TableFileName( - c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + c->immutable_options()->cf_paths, file_number, desc.GetPathId())); compaction_job_info->output_file_infos.push_back(CompactionFileInfo{ newf.first, file_number, meta.oldest_blob_file_number}); } diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index c21c9fa8f18..e590607c6d7 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -22,12 +22,13 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } -void DBImpl::TEST_SwitchWAL() { +Status DBImpl::TEST_SwitchWAL() { WriteContext write_context; InstrumentedMutexLock l(&mutex_); void* writer = TEST_BeginWrite(); - SwitchWAL(&write_context); + auto s = SwitchWAL(&write_context); TEST_EndWrite(writer); + return s; } bool DBImpl::TEST_WALBufferIsEmpty(bool lock) { @@ -170,12 +171,17 @@ Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) { while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || (wait_unscheduled && unscheduled_compactions_)) && - (error_handler_.GetBGError() == Status::OK())) { + (error_handler_.GetBGError().ok())) { bg_cv_.Wait(); } return 
error_handler_.GetBGError(); } +Status DBImpl::TEST_GetBGError() { + InstrumentedMutexLock l(&mutex_); + return error_handler_.GetBGError(); +} + void DBImpl::TEST_LockMutex() { mutex_.Lock(); } void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); } diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 2d30f5857b9..c0405d6bf48 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -6,16 +6,17 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl/db_impl.h" - #include #include #include + +#include "db/db_impl/db_impl.h" #include "db/event_helpers.h" #include "db/memtable_list.h" #include "file/file_util.h" #include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "port/port.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -119,7 +120,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, mutable_db_options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { - const uint64_t now_micros = env_->NowMicros(); + const uint64_t now_micros = immutable_db_options_.clock->NowMicros(); if ((delete_obsolete_files_last_run_ + mutable_db_options_.delete_obsolete_files_period_micros) < now_micros) { @@ -190,7 +191,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; - env_->GetChildren(path, &files).PermitUncheckedError(); // Ignore errors + Status s = env_->GetChildren(path, &files); + s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& file : files) { uint64_t number; FileType type; @@ -206,7 +208,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, continue; } - // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes + // TODO(icanadi) clean up this mess to avoid having one-off "/" + // prefixes job_context->full_scan_candidate_files.emplace_back("/" + file, path); } } @@ -214,9 +217,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // Add log files in wal_dir if (immutable_db_options_.wal_dir != dbname_) { std::vector log_files; - env_->GetChildren(immutable_db_options_.wal_dir, - &log_files) - .PermitUncheckedError(); // Ignore errors + Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files); + s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& log_file : log_files) { job_context->full_scan_candidate_files.emplace_back( log_file, immutable_db_options_.wal_dir); @@ -226,9 +228,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, if (!immutable_db_options_.db_log_dir.empty() && immutable_db_options_.db_log_dir != dbname_) { std::vector info_log_files; - // Ignore errors - env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files) - .PermitUncheckedError(); + Status s = + env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + s.PermitUncheckedError(); // TODO: What should we do on error? 
for (std::string& log_file : info_log_files) { job_context->full_scan_candidate_files.emplace_back( log_file, immutable_db_options_.db_log_dir); @@ -319,7 +321,7 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, const_cast(&fname)); Status file_deletion_status; - if (type == kTableFile || type == kBlobFile || type == kLogFile) { + if (type == kTableFile || type == kBlobFile || type == kWalFile) { file_deletion_status = DeleteDBFile(&immutable_db_options_, fname, path_to_sync, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); @@ -466,7 +468,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { bool keep = true; switch (type) { - case kLogFile: + case kWalFile: keep = ((number >= state.log_number) || (number == state.prev_log_number) || (log_recycle_files_set.find(number) != @@ -546,7 +548,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { dir_to_sync = candidate_file.file_path; } else { dir_to_sync = - (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_; + (type == kWalFile) ? immutable_db_options_.wal_dir : dbname_; fname = dir_to_sync + ((!dir_to_sync.empty() && dir_to_sync.back() == '/') || (!to_delete.empty() && to_delete.front() == '/') @@ -556,8 +558,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { } #ifndef ROCKSDB_LITE - if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 || - immutable_db_options_.wal_size_limit_mb > 0)) { + if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0)) { wal_manager_.ArchiveWALFile(fname, number); continue; } @@ -658,13 +660,15 @@ uint64_t FindMinPrepLogReferencedByMemTable( // we must look through the memtables for two phase transactions // that have been committed but not yet flushed + std::unordered_set memtables_to_flush_set( + memtables_to_flush.begin(), memtables_to_flush.end()); for (auto loop_cfd : *vset->GetColumnFamilySet()) { if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) { continue; } auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( - memtables_to_flush); + &memtables_to_flush_set); if (log > 0 && (min_log == 0 || log < min_log)) { min_log = log; @@ -680,16 +684,41 @@ uint64_t FindMinPrepLogReferencedByMemTable( return min_log; } -uint64_t PrecomputeMinLogNumberToKeep( +uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector*>& memtables_to_flush) { + uint64_t min_log = 0; + + std::unordered_set cfds_to_flush_set(cfds_to_flush.begin(), + cfds_to_flush.end()); + std::unordered_set memtables_to_flush_set; + for (const autovector* memtables : memtables_to_flush) { + memtables_to_flush_set.insert(memtables->begin(), memtables->end()); + } + for (auto loop_cfd : *vset->GetColumnFamilySet()) { + if (loop_cfd->IsDropped() || cfds_to_flush_set.count(loop_cfd)) { + continue; + } + + auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( + &memtables_to_flush_set); + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + + log = loop_cfd->mem()->GetMinLogContainingPrepSection(); + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +uint64_t PrecomputeMinLogNumberToKeepNon2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, - autovector edit_list, - const autovector& memtables_to_flush, - LogsWithPrepTracker* prep_tracker) { + const autovector& edit_list) { assert(vset != nullptr); - 
assert(prep_tracker != nullptr); - // Calculate updated min_log_number_to_keep - // Since the function should only be called in 2pc mode, log number in - // the version edit should be sufficient. // Precompute the min log number containing unflushed data for the column // family being flushed (`cfd_to_flush`). @@ -713,6 +742,58 @@ uint64_t PrecomputeMinLogNumberToKeep( min_log_number_to_keep = std::min(cf_min_log_number_to_keep, min_log_number_to_keep); } + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists) { + assert(vset != nullptr); + assert(!cfds_to_flush.empty()); + assert(cfds_to_flush.size() == edit_lists.size()); + + uint64_t min_log_number_to_keep = port::kMaxUint64; + for (const auto& edit_list : edit_lists) { + uint64_t log = 0; + for (const auto& e : edit_list) { + if (e->HasLogNumber()) { + log = std::max(log, e->GetLogNumber()); + } + } + if (log != 0) { + min_log_number_to_keep = std::min(min_log_number_to_keep, log); + } + } + if (min_log_number_to_keep == port::kMaxUint64) { + min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber(); + for (size_t i = 1; i < cfds_to_flush.size(); i++) { + min_log_number_to_keep = + std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber()); + } + } + + std::unordered_set flushed_cfds( + cfds_to_flush.begin(), cfds_to_flush.end()); + min_log_number_to_keep = + std::min(min_log_number_to_keep, + vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds)); + + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + const autovector& edit_list, + const autovector& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + // Calculate updated min_log_number_to_keep + // Since the function should only be called in 2pc mode, log number in + // the version edit should be sufficient. + + uint64_t min_log_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list); // if are 2pc we must consider logs containing prepared // sections of outstanding transactions. @@ -741,7 +822,81 @@ uint64_t PrecomputeMinLogNumberToKeep( return min_log_number_to_keep; } -Status DBImpl::FinishBestEffortsRecovery() { +uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists, + const autovector*>& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + assert(cfds_to_flush.size() == edit_lists.size()); + assert(cfds_to_flush.size() == memtables_to_flush.size()); + + uint64_t min_log_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists); + + uint64_t min_log_in_prep_heap = + prep_tracker->FindMinLogContainingOutstandingPrep(); + + if (min_log_in_prep_heap != 0 && + min_log_in_prep_heap < min_log_number_to_keep) { + min_log_number_to_keep = min_log_in_prep_heap; + } + + uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable( + vset, cfds_to_flush, memtables_to_flush); + + if (min_log_refed_by_mem != 0 && + min_log_refed_by_mem < min_log_number_to_keep) { + min_log_number_to_keep = min_log_refed_by_mem; + } + + return min_log_number_to_keep; +} + +Status DBImpl::SetDBId(bool read_only) { + Status s; + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. 
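// The non-2PC helpers above reduce to: for each flushed column family take the
// largest WAL number recorded in its version edits (falling back to the CF's
// current log number when the edits carry none), then keep the minimum of
// those values, further capped by WALs still holding unflushed data in other
// CFs. A worked standalone sketch of the min-of-max step, using plain
// integers and a hypothetical helper rather than RocksDB's types:

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// edit_logs[i] holds the WAL numbers recorded in the edits of flushed CF i;
// cf_log[i] is that CF's current log number (assumed non-empty input).
uint64_t MinWalToKeepAfterFlush(
    const std::vector<std::vector<uint64_t>>& edit_logs,
    const std::vector<uint64_t>& cf_log) {
  uint64_t min_to_keep = std::numeric_limits<uint64_t>::max();
  for (const auto& logs : edit_logs) {
    if (!logs.empty()) {
      // The newest WAL referenced by this CF's edits; once the flush is
      // installed, older WALs are obsolete as far as this CF is concerned.
      uint64_t newest = *std::max_element(logs.begin(), logs.end());
      min_to_keep = std::min(min_to_keep, newest);
    }
  }
  if (min_to_keep == std::numeric_limits<uint64_t>::max()) {
    // No edit carried a log number: fall back to the CFs' current log numbers.
    min_to_keep = *std::min_element(cf_log.begin(), cf_log.end());
  }
  return min_to_keep;
}

// Example: edits reference WALs {5, 7} and {6}; min-of-max = min(7, 6) = 6,
// so WALs numbered below 6 are no longer needed by the flushed CFs.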
+ if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. + if (s.IsNotFound()) { + // Create a new DB ID, saving to file only if allowed + if (read_only) { + db_id_ = env_->GenerateUniqueId(); + return Status::OK(); + } else { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + s = GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + /* new_descriptor_log */ false); + } + } else if (!read_only) { + s = SetIdentityFile(env_, dbname_, db_id_); + } + return s; +} + +Status DBImpl::DeleteUnreferencedSstFiles() { mutex_.AssertHeld(); std::vector paths; paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator))); @@ -788,7 +943,7 @@ Status DBImpl::FinishBestEffortsRecovery() { return s; } - if (largest_file_number > next_file_number) { + if (largest_file_number >= next_file_number) { versions_->next_file_number_.store(largest_file_number + 1); } @@ -797,8 +952,6 @@ Status DBImpl::FinishBestEffortsRecovery() { assert(versions_->GetColumnFamilySet()); ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault(); assert(default_cfd); - // Even if new_descriptor_log is false, we will still switch to a new - // MANIFEST and update CURRENT file, since this is in recovery. 
s = versions_->LogAndApply( default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir(), /*new_descriptor_log*/ false); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 71f13bacf0f..64f4ae929ea 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -33,15 +33,17 @@ extern "C" bool RocksDbFileChecksumsVerificationEnabledOnRecovery() #endif // !ROCKSDB_LITE && OS_LINUX namespace ROCKSDB_NAMESPACE { -Options SanitizeOptions(const std::string& dbname, const Options& src) { - auto db_options = SanitizeOptions(dbname, DBOptions(src)); +Options SanitizeOptions(const std::string& dbname, const Options& src, + bool read_only) { + auto db_options = SanitizeOptions(dbname, DBOptions(src), read_only); ImmutableDBOptions immutable_db_options(db_options); auto cf_options = SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src)); return Options(db_options, cf_options); } -DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { +DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, + bool read_only) { DBOptions result(src); if (result.env == nullptr) { @@ -59,7 +61,7 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { &result.max_open_files); } - if (result.info_log == nullptr) { + if (result.info_log == nullptr && !read_only) { Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); if (!s.ok()) { // No place suitable for logging @@ -156,7 +158,8 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { // DeleteScheduler::CleanupDirectory on the same dir later, it will be // safe std::vector filenames; - result.env->GetChildren(result.wal_dir, &filenames).PermitUncheckedError(); + Status s = result.env->GetChildren(result.wal_dir, &filenames); + s.PermitUncheckedError(); //**TODO: What to do on error? 
for (std::string& filename : filenames) { if (filename.find(".log.trash", filename.length() - std::string(".log.trash").length()) != @@ -172,7 +175,8 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { // was not used) auto sfm = static_cast(result.sst_file_manager.get()); for (size_t i = 0; i < result.db_paths.size(); i++) { - DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); + DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path) + .PermitUncheckedError(); } // Create a default SstFileManager for purposes of tracking compaction size @@ -182,7 +186,7 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { NewSstFileManager(result.env, result.info_log)); result.sst_file_manager = sst_file_manager; } -#endif +#endif // !ROCKSDB_LITE if (!result.paranoid_checks) { result.skip_checking_sst_file_sizes_on_db_open = true; @@ -290,23 +294,28 @@ Status DBImpl::NewDB(std::vector* new_filenames) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); { + if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) { + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); + } std::unique_ptr file; FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); s = NewWritableFile(fs_.get(), manifest, &file, file_options); if (!s.ok()) { return s; } + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; file->SetPreallocationBlockSize( immutable_db_options_.manifest_preallocation_size); std::unique_ptr file_writer(new WritableFileWriter( - std::move(file), manifest, file_options, env_, io_tracer_, - nullptr /* stats */, immutable_db_options_.listeners)); + std::move(file), manifest, file_options, immutable_db_options_.clock, + io_tracer_, nullptr /* stats */, immutable_db_options_.listeners, + nullptr, tmp_set.Contains(FileType::kDescriptorFile))); log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); if (s.ok()) { - s = SyncManifest(env_, &immutable_db_options_, log.file()); + s = SyncManifest(&immutable_db_options_, log.file()); } } if (s.ok()) { @@ -317,7 +326,7 @@ Status DBImpl::NewDB(std::vector* new_filenames) { manifest.substr(manifest.find_last_of("/\\") + 1)); } } else { - fs_->DeleteFile(manifest, IOOptions(), nullptr); + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); } return s; } @@ -410,7 +419,7 @@ Status DBImpl::Recover( } for (const std::string& file : files_in_dbname) { uint64_t number = 0; - FileType type = kLogFile; // initialize + FileType type = kWalFile; // initialize if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { // Found MANIFEST (descriptor log), thus best-efforts recovery does // not have to treat the db as empty. @@ -488,42 +497,14 @@ Status DBImpl::Recover( // TryRecover may delete previous column_family_set_. column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); - s = FinishBestEffortsRecovery(); } } if (!s.ok()) { return s; } - // Happens when immutable_db_options_.write_dbid_to_manifest is set to true - // the very first time. - if (db_id_.empty()) { - // Check for the IDENTITY file and create it if not there. 
- s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); - // Typically Identity file is created in NewDB() and for some reason if - // it is no longer available then at this point DB ID is not in Identity - // file or Manifest. - if (s.IsNotFound()) { - s = SetIdentityFile(env_, dbname_); - if (!s.ok()) { - return s; - } - } else if (!s.ok()) { - assert(s.IsIOError()); - return s; - } - s = GetDbIdentityFromIdentityFile(&db_id_); - if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { - VersionEdit edit; - edit.SetDBId(db_id_); - Options options; - MutableCFOptions mutable_cf_options(options); - versions_->db_id_ = db_id_; - s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &edit, &mutex_, nullptr, - false); - } - } else { - s = SetIdentityFile(env_, dbname_, db_id_); + s = SetDBId(read_only); + if (s.ok() && !read_only) { + s = DeleteUnreferencedSstFiles(); } if (immutable_db_options_.paranoid_checks && s.ok()) { @@ -545,7 +526,7 @@ Status DBImpl::Recover( std::vector files_in_wal_dir; if (s.ok()) { - // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // Initial max_total_in_memory_state_ before recovery wals. Log recovery // may check this value to decide whether to flush. max_total_in_memory_state_ = 0; for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -580,32 +561,53 @@ Status DBImpl::Recover( return s; } - std::vector logs; + std::unordered_map wal_files; for (const auto& file : files_in_wal_dir) { uint64_t number; FileType type; - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { if (is_new_db) { return Status::Corruption( "While creating a new Db, wal_dir contains " "existing log file: ", file); } else { - logs.push_back(number); + wal_files[number] = + LogFileName(immutable_db_options_.wal_dir, number); } } } - if (logs.size() > 0) { + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + if (!immutable_db_options_.best_efforts_recovery) { + // Verify WALs in MANIFEST. + s = versions_->GetWalSet().CheckWals(env_, wal_files); + } // else since best effort recovery does not recover from WALs, no need + // to check WALs. + } else if (!versions_->GetWalSet().GetWals().empty()) { + // Tracking is disabled, clear previously tracked WALs from MANIFEST, + // otherwise, in the future, if WAL tracking is enabled again, + // since the WALs deleted when WAL tracking is disabled are not persisted + // into MANIFEST, WAL check may fail. 
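The WAL verification added above only runs when tracking is enabled. A minimal usage sketch of turning the option on at open time (path and setup are illustrative, not taken from this patch):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      // When enabled, WAL additions/deletions are recorded in the MANIFEST and
      // cross-checked against the files in wal_dir during recovery (skipped
      // under best-efforts recovery, as noted in the change above).
      options.track_and_verify_wals_in_manifest = true;

      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/wal_tracking_demo", &db);
      assert(s.ok());
      delete db;
      return 0;
    }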
+ VersionEdit edit; + WalNumber max_wal_number = + versions_->GetWalSet().GetWals().rbegin()->first; + edit.DeleteWalsBefore(max_wal_number + 1); + s = versions_->LogAndApplyToDefaultColumnFamily(&edit, &mutex_); + } + if (!s.ok()) { + return s; + } + + if (!wal_files.empty()) { if (error_if_wal_file_exists) { return Status::Corruption( "The db was opened in readonly mode with error_if_wal_file_exists" "flag but a WAL file already exists"); } else if (error_if_data_exists_in_wals) { - for (auto& log : logs) { - std::string fname = LogFileName(immutable_db_options_.wal_dir, log); + for (auto& wal_file : wal_files) { uint64_t bytes; - s = env_->GetFileSize(fname, &bytes); + s = env_->GetFileSize(wal_file.second, &bytes); if (s.ok()) { if (bytes > 0) { return Status::Corruption( @@ -617,13 +619,19 @@ Status DBImpl::Recover( } } - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - bool corrupted_log_found = false; - s = RecoverLogFiles(logs, &next_sequence, read_only, - &corrupted_log_found); - if (corrupted_log_found && recovered_seq != nullptr) { + if (!wal_files.empty()) { + // Recover in the order in which the wals were generated + std::vector wals; + wals.reserve(wal_files.size()); + for (const auto& wal_file : wal_files) { + wals.push_back(wal_file.first); + } + std::sort(wals.begin(), wals.end()); + + bool corrupted_wal_found = false; + s = RecoverLogFiles(wals, &next_sequence, read_only, + &corrupted_wal_found); + if (corrupted_wal_found && recovered_seq != nullptr) { *recovered_seq = next_sequence; } if (!s.ok()) { @@ -776,10 +784,10 @@ Status DBImpl::InitPersistStatsColumnFamily() { return s; } -// REQUIRES: log_numbers are sorted in ascending order -Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, +// REQUIRES: wal_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, SequenceNumber* next_sequence, bool read_only, - bool* corrupted_log_found) { + bool* corrupted_wal_found) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; @@ -809,10 +817,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, auto stream = event_logger_.Log(); stream << "job" << job_id << "event" << "recovery_started"; - stream << "log_files"; + stream << "wal_files"; stream.StartArray(); - for (auto log_number : log_numbers) { - stream << log_number; + for (auto wal_number : wal_numbers) { + stream << wal_number; } stream.EndArray(); } @@ -835,25 +843,25 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, bool stop_replay_by_wal_filter = false; bool stop_replay_for_corruption = false; bool flushed = false; - uint64_t corrupted_log_number = kMaxSequenceNumber; - uint64_t min_log_number = MinLogNumberToKeep(); - for (auto log_number : log_numbers) { - if (log_number < min_log_number) { + uint64_t corrupted_wal_number = kMaxSequenceNumber; + uint64_t min_wal_number = MinLogNumberToKeep(); + for (auto wal_number : wal_numbers) { + if (wal_number < min_wal_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Skipping log #%" PRIu64 " since it is older than min log to keep #%" PRIu64, - log_number, min_log_number); + wal_number, min_wal_number); continue; } // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(log_number); + versions_->MarkFileNumberUsed(wal_number); // Open the log file - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, wal_number); ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Recovering log #%" PRIu64 " mode %d", log_number, + "Recovering log #%" PRIu64 " mode %d", wal_number, static_cast(immutable_db_options_.wal_recovery_mode)); auto logFileDropped = [this, &fname]() { uint64_t bytes; @@ -906,7 +914,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // to be skipped instead of propagating bad information (like overly // large sequence numbers). log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), - &reporter, true /*checksum*/, log_number); + &reporter, true /*checksum*/, wal_number); // Determine if we should tolerate incomplete records at the tail end of the // Read all the records and add to a memtable @@ -954,7 +962,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, WalFilter::WalProcessingOption wal_processing_option = immutable_db_options_.wal_filter->LogRecordFound( - log_number, fname, batch, &new_batch, &batch_changed); + wal_number, fname, batch, &new_batch, &batch_changed); switch (wal_processing_option) { case WalFilter::WalProcessingOption::kContinueProcessing: @@ -1006,7 +1014,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, " mode %d log filter %s returned " "more records (%d) than original (%d) which is not allowed. " "Aborting recovery.", - log_number, + wal_number, static_cast(immutable_db_options_.wal_recovery_mode), immutable_db_options_.wal_filter->Name(), new_count, original_count); @@ -1033,7 +1041,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, bool has_valid_writes = false; status = WriteBatchInternal::InsertInto( &batch, column_family_memtables_.get(), &flush_scheduler_, - &trim_history_scheduler_, true, log_number, this, + &trim_history_scheduler_, true, wal_number, this, false /* concurrent_memtable_writes */, next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); MaybeIgnoreError(&status); @@ -1053,7 +1061,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, cfd->UnrefAndTryDelete(); // If this asserts, it means that InsertInto failed in // filtering updates to already-flushed column families - assert(cfd->GetLogNumber() <= log_number); + assert(cfd->GetLogNumber() <= wal_number); auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; @@ -1090,21 +1098,21 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, " seq #%" PRIu64 ". %s. 
This likely mean loss of synced WAL, " "thus recovery fails.", - log_number, *next_sequence, + wal_number, *next_sequence, status.ToString().c_str()); return status; } // We should ignore the error but not continue replaying status = Status::OK(); stop_replay_for_corruption = true; - corrupted_log_number = log_number; - if (corrupted_log_found != nullptr) { - *corrupted_log_found = true; + corrupted_wal_number = wal_number; + if (corrupted_wal_found != nullptr) { + *corrupted_wal_found = true; } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64, - log_number, *next_sequence); + wal_number, *next_sequence); } else { assert(immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords || @@ -1130,7 +1138,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // corruption. This could during PIT recovery when the WAL is corrupted and // some (but not all) CFs are flushed // Exclude the PIT case where no log is dropped after the corruption point. - // This is to cover the case for empty logs after corrupted log, in which we + // This is to cover the case for empty wals after corrupted log, in which we // don't reset stop_replay_for_corruption. if (stop_replay_for_corruption == true && (immutable_db_options_.wal_recovery_mode == @@ -1138,11 +1146,29 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords)) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->GetLogNumber() > corrupted_log_number) { + // One special case causes cfd->GetLogNumber() > corrupted_wal_number while + // the CF is still consistent: if a new column family is created during + // the flush and the WAL sync fails at the same time, the new CF points to + // the new WAL but the old WAL is corrupted. Since the new CF is empty, it + // is still consistent. We add the check of CF sst file size to avoid the + // false positive alert. + + // Note that the check of (cfd->GetLiveSstFilesSize() > 0) may lead to + // ignoring a very rare inconsistency case caused by data + // cancellation: one CF is empty due to KV deletion, but those operations + // are in the WAL. If the WAL is corrupted, the status of this CF might + // not be consistent with others. However, the consistency check will be + // bypassed due to the empty CF. + // TODO: a better and complete implementation is needed to ensure a strict + // consistency check in WAL recovery, including handling the tailing + // issues.
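The point-in-time branch discussed above is selected through DBOptions::wal_recovery_mode. A short sketch of opting into it (illustrative only):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      // Stop replay at the first corrupted WAL record and open with whatever
      // was recovered up to that point, instead of failing the open.
      options.wal_recovery_mode =
          rocksdb::WALRecoveryMode::kPointInTimeRecovery;

      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/pit_recovery_demo", &db);
      assert(s.ok());
      delete db;
      return 0;
    }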
+ if (cfd->GetLogNumber() > corrupted_wal_number && + cfd->GetLiveSstFilesSize() > 0) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Column family inconsistency: SST file contains data" " beyond the point of corruption."); - return Status::Corruption("SST file is ahead of WALs"); + return Status::Corruption("SST file is ahead of WALs in CF " + + cfd->GetName()); } } } @@ -1153,16 +1179,16 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, if (!read_only) { // no need to refcount since client still doesn't have access // to the DB and can not drop column families while we iterate - auto max_log_number = log_numbers.back(); + const WalNumber max_wal_number = wal_numbers.back(); for (auto cfd : *versions_->GetColumnFamilySet()) { auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; - if (cfd->GetLogNumber() > max_log_number) { + if (cfd->GetLogNumber() > max_wal_number) { // Column family cfd has already flushed the data - // from all logs. Memtable has to be empty because - // we filter the updates based on log_number + // from all wals. Memtable has to be empty because + // we filter the updates based on wal_number // (in WriteBatch::InsertInto) assert(cfd->mem()->GetFirstSequenceNumber() == 0); assert(edit->NumEntries() == 0); @@ -1194,13 +1220,13 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // Update the log number info in the version edit corresponding to this // column family. Note that the version edits will be written to MANIFEST // together later. - // writing log_number in the manifest means that any log file - // with number strongly less than (log_number + 1) is already + // writing wal_number in the manifest means that any log file + // with number strongly less than (wal_number + 1) is already // recovered and should be ignored on next reincarnation. - // Since we already recovered max_log_number, we want all logs - // with numbers `<= max_log_number` (includes this one) to be ignored + // Since we already recovered max_wal_number, we want all wals + // with numbers `<= max_wal_number` (includes this one) to be ignored if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { - edit->SetLogNumber(max_log_number + 1); + edit->SetLogNumber(max_wal_number + 1); } } if (status.ok()) { @@ -1208,7 +1234,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // not actually used. 
that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(max_log_number + 1); + versions_->MarkFileNumberUsed(max_wal_number + 1); autovector cfds; autovector cf_opts; @@ -1220,6 +1246,14 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, assert(iter != version_edits.end()); edit_lists.push_back({&iter->second}); } + + std::unique_ptr wal_deletion; + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(max_wal_number + 1); + edit_lists.back().push_back(wal_deletion.get()); + } + // write MANIFEST with update status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_, directories_.GetDbDir(), @@ -1227,8 +1261,17 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } } - if (status.ok() && data_seen && !flushed) { - status = RestoreAliveLogFiles(log_numbers); + if (status.ok()) { + if (data_seen && !flushed) { + status = RestoreAliveLogFiles(wal_numbers); + } else { + // If there's no data in the WAL, or we flushed all the data, still + // truncate the log file. If the process goes into a crash loop before + // the file is deleted, the preallocated space will never get freed. + const bool truncate = !read_only; + GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr) + .PermitUncheckedError(); + } } event_logger_.Log() << "job" << job_id << "event" @@ -1237,8 +1280,42 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, return status; } -Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { - if (log_numbers.empty()) { +Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log_ptr) { + LogFileNumberSize log(wal_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, wal_number); + Status s; + // This gets the appear size of the wals, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (s.ok() && truncate) { + std::unique_ptr last_log; + Status truncate_status = fs_->ReopenWritableFile( + fname, + fs_->OptimizeForLogWrite( + file_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_)), + &last_log, nullptr); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(IOOptions(), nullptr); + } + // Not a critical error if fail to truncate. + if (!truncate_status.ok() && !truncate_status.IsNotSupported()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", wal_number, + truncate_status.ToString().c_str()); + } + } + if (log_ptr) { + *log_ptr = log; + } + return s; +} + +Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { + if (wal_numbers.empty()) { return Status::OK(); } Status s; @@ -1251,40 +1328,18 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { // FindObsoleteFiles() total_log_size_ = 0; log_empty_ = false; - for (auto log_number : log_numbers) { - LogFileNumberSize log(log_number); - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); - // This gets the appear size of the logs, not including preallocated space. 
- s = env_->GetFileSize(fname, &log.size); + for (auto wal_number : wal_numbers) { + // We preallocate space for wals, but then after a crash and restart, those + // preallocated space are not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. + LogFileNumberSize log; + s = GetLogSizeAndMaybeTruncate( + wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log); if (!s.ok()) { break; } total_log_size_ += log.size; alive_log_files_.push_back(log); - // We preallocate space for logs, but then after a crash and restart, those - // preallocated space are not needed anymore. It is likely only the last - // log has such preallocated space, so we only truncate for the last log. - if (log_number == log_numbers.back()) { - std::unique_ptr last_log; - Status truncate_status = fs_->ReopenWritableFile( - fname, - fs_->OptimizeForLogWrite( - file_options_, - BuildDBOptions(immutable_db_options_, mutable_db_options_)), - &last_log, nullptr); - if (truncate_status.ok()) { - truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); - } - if (truncate_status.ok()) { - truncate_status = last_log->Close(IOOptions(), nullptr); - } - // Not a critical error if fail to truncate. - if (!truncate_status.ok()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Failed to truncate log #%" PRIu64 ": %s", log_number, - truncate_status.ToString().c_str()); - } - } } if (two_write_queues_) { log_write_mutex_.Unlock(); @@ -1295,7 +1350,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit) { mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); + const uint64_t start_micros = immutable_db_options_.clock->NowMicros(); FileMetaData meta; std::vector blob_file_additions; @@ -1323,7 +1378,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, cfd->GetLatestMutableCFOptions()->paranoid_file_checks; int64_t _current_time = 0; - env_->GetCurrentTime(&_current_time) + immutable_db_options_.clock->GetCurrentTime(&_current_time) .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); meta.oldest_ancester_time = current_time; @@ -1348,21 +1403,23 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, } IOStatus io_s; - s = BuildTable( - dbname_, versions_.get(), env_, fs_.get(), *cfd->ioptions(), - mutable_cf_options, file_options_for_compaction_, cfd->table_cache(), - iter.get(), std::move(range_del_iters), &meta, &blob_file_additions, - cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - cfd->GetID(), cfd->GetName(), snapshot_seqs, - earliest_write_conflict_snapshot, snapshot_checker, + TableBuilderOptions tboptions( + *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), - mutable_cf_options.sample_for_compression, - mutable_cf_options.compression_opts, paranoid_file_checks, - cfd->internal_stats(), TableFileCreationReason::kRecovery, &io_s, - io_tracer_, &event_logger_, job_id, Env::IO_HIGH, - nullptr /* table_properties */, -1 /* level */, current_time, - 0 /* oldest_key_time */, write_hint, 0 /* file_creation_time */, - db_id_, db_session_id_); + mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(), + 0 /* level */, false /* is_bottommost */, + 
TableFileCreationReason::kRecovery, current_time, + 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_, + db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber()); + s = BuildTable( + dbname_, versions_.get(), immutable_db_options_, tboptions, + file_options_for_compaction_, cfd->table_cache(), iter.get(), + std::move(range_del_iters), &meta, &blob_file_additions, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, + &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, + write_hint, nullptr /*full_history_ts_low*/, &blob_callback_); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" @@ -1370,6 +1427,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); mutex_.Lock(); + + io_s.PermitUncheckedError(); // TODO(AR) is this correct, or should we + // return io_s if not ok()? } } ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); @@ -1377,7 +1437,6 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. const bool has_output = meta.fd.GetFileSize() > 0; - assert(has_output || blob_file_additions.empty()); constexpr int level = 0; @@ -1389,26 +1448,30 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.oldest_ancester_time, meta.file_creation_time, meta.file_checksum, meta.file_checksum_func_name); - edit->SetBlobFileAdditions(std::move(blob_file_additions)); + for (const auto& blob : blob_file_additions) { + edit->AddBlobFile(blob); + } } InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = env_->NowMicros() - start_micros; + stats.micros = immutable_db_options_.clock->NowMicros() - start_micros; if (has_output) { stats.bytes_written = meta.fd.GetFileSize(); + stats.num_output_files = 1; + } - const auto& blobs = edit->GetBlobFileAdditions(); - for (const auto& blob : blobs) { - stats.bytes_written += blob.GetTotalBlobBytes(); - } - - stats.num_output_files = static_cast(blobs.size()) + 1; + const auto& blobs = edit->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); } + stats.num_output_files_blob = static_cast(blobs.size()); + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); - cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - stats.bytes_written); + cfd->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; } @@ -1498,9 +1561,11 @@ IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, lfile->SetPreallocationBlockSize(preallocate_block_size); const auto& listeners = immutable_db_options_.listeners; + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( - std::move(lfile), log_fname, opt_file_options, env_, io_tracer_, - nullptr /* stats */, listeners)); + std::move(lfile), log_fname, opt_file_options, + immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners, + nullptr, tmp_set.Contains(FileType::kWalFile))); *new_log = new 
log::Writer(std::move(file_writer), log_file_num, immutable_db_options_.recycle_log_file_num > 0, immutable_db_options_.manual_wal_flush); @@ -1581,6 +1646,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, InstrumentedMutexLock wl(&impl->log_write_mutex_); impl->logfile_number_ = new_log_number; assert(new_log != nullptr); + assert(impl->logs_.empty()); impl->logs_.emplace_back(new_log_number, new_log); } @@ -1635,7 +1701,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, // In WritePrepared there could be gap in sequence numbers. This breaks // the trick we use in kPointInTimeRecovery which assumes the first seq in // the log right after the corrupted log is one larger than the last seq - // we read from the logs. To let this trick keep working, we add a dummy + // we read from the wals. To let this trick keep working, we add a dummy // entry with the expected sequence to the first log right after recovery. // In non-WritePrepared case also the new log after recovery could be // empty, and thus missing the consecutive seq hint to distinguish @@ -1728,6 +1794,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, std::vector metadata; + // TODO: Once GetLiveFilesMetaData supports blob files, update the logic + // below to get known_file_sizes for blob files. impl->mutex_.Lock(); impl->versions_->GetLiveFilesMetaData(&metadata); impl->mutex_.Unlock(); @@ -1753,21 +1821,19 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); for (auto& path : paths) { std::vector existing_files; - // TODO: Check for errors here? impl->immutable_db_options_.env->GetChildren(path, &existing_files) - .PermitUncheckedError(); + .PermitUncheckedError(); //**TODO: What do to on error? for (auto& file_name : existing_files) { uint64_t file_number; FileType file_type; std::string file_path = path + "/" + file_name; if (ParseFileName(file_name, &file_number, &file_type) && - file_type == kTableFile) { + (file_type == kTableFile || file_type == kBlobFile)) { // TODO: Check for errors from OnAddFile? if (known_file_sizes.count(file_name)) { // We're assuming that each sst file name exists in at most one of // the paths. - sfm->OnAddFile(file_path, known_file_sizes.at(file_name), - /* compaction */ false) + sfm->OnAddFile(file_path, known_file_sizes.at(file_name)) .PermitUncheckedError(); } else { sfm->OnAddFile(file_path).PermitUncheckedError(); diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 57825afbb38..825e83d2fb8 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -6,7 +6,7 @@ #include "db/db_impl/db_impl_readonly.h" #include "db/arena_wrapped_db_iter.h" -#include "db/compacted_db_impl.h" +#include "db/db_impl/compacted_db_impl.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" @@ -19,7 +19,8 @@ namespace ROCKSDB_NAMESPACE { DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) - : DBImpl(db_options, dbname) { + : DBImpl(db_options, dbname, /*seq_per_batch*/ false, + /*batch_per_txn*/ true, /*read_only*/ true) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Opening the db in read only mode"); LogFlush(immutable_db_options_.info_log); @@ -83,7 +84,7 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ReadCallback* read_callback = nullptr; // No read callback provided. 
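For context, the read-only path touched above is reached through DB::OpenForReadOnly. A minimal usage sketch (paths and keys are placeholders):

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      rocksdb::DB* db = nullptr;
      // Opens an existing DB without write access; per the SanitizeOptions
      // change above, no info log is created when opening read-only.
      rocksdb::Status s =
          rocksdb::DB::OpenForReadOnly(options, "/tmp/existing_db", &db,
                                       /*error_if_wal_file_exists=*/false);
      if (s.ok()) {
        std::string value;
        s = db->Get(rocksdb::ReadOptions(), "some_key", &value);
        delete db;
      }
      return s.ok() ? 0 : 1;
    }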
auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - read_seq, + super_version->current, read_seq, super_version->mutable_cf_options.max_sequential_skip_in_iterations, super_version->version_number, read_callback); auto internal_iter = NewInternalIterator( @@ -115,7 +116,8 @@ Status DBImplReadOnly::NewIterators( auto* cfd = static_cast_with_check(cfh)->cfd(); auto* sv = cfd->GetSuperVersion()->Ref(); auto* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + sv->current, read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, sv->version_number, read_callback); auto* internal_iter = NewInternalIterator( @@ -130,8 +132,8 @@ Status DBImplReadOnly::NewIterators( } namespace { -// Return OK if dbname exists in the file system -// or create_if_missing is false +// Return OK if dbname exists in the file system or create it if +// create_if_missing Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, const std::string& dbname) { Status s; @@ -142,9 +144,9 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, uint64_t manifest_file_number; s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path, &manifest_file_number); - if (!s.ok()) { - return Status::NotFound(CurrentFileName(dbname), "does not exist"); - } + } else { + // Historic behavior that doesn't necessarily make sense + s = db_options.env->CreateDirIfMissing(dbname); } return s; } @@ -152,7 +154,6 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DB** dbptr, bool /*error_if_wal_file_exists*/) { - // If dbname does not exist in the file system, should not do anything Status s = OpenForReadOnlyCheckExistence(options, dbname); if (!s.ok()) { return s; diff --git a/db/db_impl/db_impl_remote_compaction.cc b/db/db_impl/db_impl_remote_compaction.cc index 3fc4d5506ff..f8e9f381932 100644 --- a/db/db_impl/db_impl_remote_compaction.cc +++ b/db/db_impl/db_impl_remote_compaction.cc @@ -127,15 +127,18 @@ Status DBImpl::doCompact(const CompactionOptions& compact_options, // create compaction job CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, + job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_, - &error_handler_, existing_snapshots, earliest_write_conflict_snapshot, - snapshot_checker, table_cache_, &event_logger_, + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_, + existing_snapshots, earliest_write_conflict_snapshot, snapshot_checker, + table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER, nullptr); + &compaction_job_stats, Env::Priority::USER, io_tracer_, + &manual_compaction_paused_, nullptr, db_id_, db_session_id_, + c->column_family_data()->GetFullHistoryTsLow()); compaction_job.Prepare(); mutex_.Unlock(); diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index c0572948e23..10c04b3e169 100644 --- 
a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -11,14 +11,17 @@ #include "db/merge_context.h" #include "logging/auto_roll_logger.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/configurable.h" #include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE DBImplSecondary::DBImplSecondary(const DBOptions& db_options, - const std::string& dbname) - : DBImpl(db_options, dbname) { + const std::string& dbname, + std::string secondary_path) + : DBImpl(db_options, dbname, false, true, true), + secondary_path_(std::move(secondary_path)) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Opening the db in secondary mode"); LogFlush(immutable_db_options_.info_log); @@ -112,7 +115,7 @@ Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { for (size_t i = 0; i < filenames.size(); i++) { uint64_t number; FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && + if (ParseFileName(filenames[i], &number, &type) && type == kWalFile && number >= log_number_min) { logs->push_back(number); } @@ -327,8 +330,8 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { assert(pinnable_val != nullptr); - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_GET); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); auto cfh = static_cast(column_family); @@ -421,7 +424,7 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - snapshot, + super_version->current, snapshot, super_version->mutable_cf_options.max_sequential_skip_in_iterations, super_version->version_number, read_callback); auto internal_iter = NewInternalIterator( @@ -519,7 +522,8 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { { InstrumentedMutexLock lock_guard(&mutex_); s = static_cast_with_check(versions_.get()) - ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + ->ReadAndApply(&mutex_, &manifest_reader_, + manifest_reader_status_.get(), &cfds_changed); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, static_cast(versions_->LastSequence())); @@ -616,7 +620,7 @@ Status DB::OpenAsSecondary( } handles->clear(); - DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path); impl->versions_.reset(new ReactiveVersionSet( dbname, &impl->immutable_db_options_, impl->file_options_, impl->table_cache_.get(), impl->write_buffer_manager_, @@ -662,6 +666,159 @@ Status DB::OpenAsSecondary( } return s; } + +Status DBImplSecondary::CompactWithoutInstallation( + ColumnFamilyHandle* cfh, const CompactionServiceInput& input, + CompactionServiceResult* result) { + InstrumentedMutexLock l(&mutex_); + auto cfd = static_cast_with_check(cfh)->cfd(); + if (!cfd) { + return Status::InvalidArgument("Cannot find column family" + + cfh->GetName()); + } + + std::unordered_set input_set; + for (const auto& file_name : input.input_files) { + input_set.insert(TableFileNameToNumber(file_name)); + } + + auto* version = cfd->current(); + + ColumnFamilyMetaData cf_meta; + version->GetColumnFamilyMetaData(&cf_meta); + + const MutableCFOptions* 
mutable_cf_options = cfd->GetLatestMutableCFOptions(); + ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions(); + VersionStorageInfo* vstorage = version->storage_info(); + + // Use comp_options to reuse some CompactFiles functions + CompactionOptions comp_options; + comp_options.compression = kDisableCompressionOption; + comp_options.output_file_size_limit = MaxFileSizeForLevel( + *mutable_cf_options, input.output_level, cf_options.compaction_style, + vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes); + + std::vector input_files; + Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage, comp_options); + if (!s.ok()) { + return s; + } + + std::unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->CompactFiles( + comp_options, input_files, input.output_level, vstorage, + *mutable_cf_options, mutable_db_options_, 0)); + assert(c != nullptr); + + c->SetInputVersion(version); + + // Create output directory if it's not existed yet + std::unique_ptr output_dir; + s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir); + if (!s.ok()) { + return s; + } + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + + const int job_id = next_job_id_.fetch_add(1); + + CompactionServiceCompactionJob compaction_job( + job_id, c.get(), immutable_db_options_, mutable_db_options_, + file_options_for_compaction_, versions_.get(), &shutting_down_, + &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_, + input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_, + db_id_, db_session_id_, secondary_path_, input, result); + + mutex_.Unlock(); + s = compaction_job.Run(); + mutex_.Lock(); + + // clean up + compaction_job.io_status().PermitUncheckedError(); + compaction_job.CleanupCompaction(); + c->ReleaseCompactionFiles(s); + c.reset(); + + TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End", + &s); + result->status = s; + return s; +} + +Status DB::OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* result, + const CompactionServiceOptionsOverride& override_options) { + CompactionServiceInput compaction_input; + Status s = CompactionServiceInput::Read(input, &compaction_input); + if (!s.ok()) { + return s; + } + + compaction_input.db_options.max_open_files = -1; + compaction_input.db_options.compaction_service = nullptr; + if (compaction_input.db_options.statistics) { + compaction_input.db_options.statistics.reset(); + } + compaction_input.db_options.env = override_options.env; + compaction_input.db_options.file_checksum_gen_factory = + override_options.file_checksum_gen_factory; + compaction_input.column_family.options.comparator = + override_options.comparator; + compaction_input.column_family.options.merge_operator = + override_options.merge_operator; + compaction_input.column_family.options.compaction_filter = + override_options.compaction_filter; + compaction_input.column_family.options.compaction_filter_factory = + override_options.compaction_filter_factory; + compaction_input.column_family.options.prefix_extractor = + override_options.prefix_extractor; + compaction_input.column_family.options.table_factory = + override_options.table_factory; + compaction_input.column_family.options.sst_partitioner_factory = + override_options.sst_partitioner_factory; + + std::vector column_families; + 
column_families.push_back(compaction_input.column_family); + // TODO: we have to open default CF, because of an implementation limitation, + // currently we just use the same CF option from input, which is not collect + // and open may fail. + if (compaction_input.column_family.name != kDefaultColumnFamilyName) { + column_families.emplace_back(kDefaultColumnFamilyName, + compaction_input.column_family.options); + } + + DB* db; + std::vector handles; + + s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory, + column_families, &handles, &db); + if (!s.ok()) { + return s; + } + + CompactionServiceResult compaction_result; + DBImplSecondary* db_secondary = static_cast_with_check(db); + assert(handles.size() > 0); + s = db_secondary->CompactWithoutInstallation(handles[0], compaction_input, + &compaction_result); + + Status serialization_status = compaction_result.Write(result); + + for (auto& handle : handles) { + delete handle; + } + delete db; + if (s.ok()) { + return serialization_status; + } + return s; +} + #else // !ROCKSDB_LITE Status DB::OpenAsSecondary(const Options& /*options*/, diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index 8fc58616fe2..e278b79cde2 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -71,7 +71,8 @@ class LogReaderContainer { // effort attempts to catch up with the primary. class DBImplSecondary : public DBImpl { public: - DBImplSecondary(const DBOptions& options, const std::string& dbname); + DBImplSecondary(const DBOptions& options, const std::string& dbname, + std::string secondary_path); ~DBImplSecondary() override; // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ @@ -222,6 +223,14 @@ class DBImplSecondary : public DBImpl { // not flag the missing file as inconsistency. Status CheckConsistency() override; +#ifndef NDEBUG + Status TEST_CompactWithoutInstallation(ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result) { + return CompactWithoutInstallation(cfh, input, result); + } +#endif // NDEBUG + protected: // ColumnFamilyCollector is a write batch handler which does nothing // except recording unique column family IDs @@ -316,6 +325,13 @@ class DBImplSecondary : public DBImpl { std::unordered_set* cfds_changed, JobContext* job_context); + // Run compaction without installation, the output files will be placed in the + // secondary DB path. The LSM tree won't be changed, the secondary DB is still + // in read-only mode. + Status CompactWithoutInstallation(ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result); + std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; @@ -326,6 +342,8 @@ class DBImplSecondary : public DBImpl { // Current WAL number replayed for each column family. 
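The secondary-instance plumbing above (secondary_path_, CompactWithoutInstallation) sits behind DB::OpenAsSecondary. A minimal sketch of opening a secondary and catching up with the primary (paths are placeholders):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.max_open_files = -1;  // secondaries are typically opened this way

      rocksdb::DB* db = nullptr;
      // secondary_path holds the secondary instance's own files and, with this
      // change, the output of CompactWithoutInstallation.
      rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
          options, "/path/to/primary_db", "/path/to/secondary_path", &db);
      assert(s.ok());

      // Replay MANIFEST entries and WALs written by the primary since open.
      s = db->TryCatchUpWithPrimary();
      assert(s.ok());
      delete db;
      return 0;
    }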
std::unordered_map cfd_to_current_log_; + + const std::string secondary_path_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 0a7cd2fa47b..5896b5a9f77 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -163,10 +163,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, RecordTick(stats_, WRITE_WITH_WAL); } - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.stats, + DB_WRITE); write_thread_.JoinBatchGroup(&w); - Status status; if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { // we are a non-leader in a parallel group @@ -196,8 +196,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } assert(w.state == WriteThread::STATE_COMPLETED); // STATE_COMPLETED conditional below handles exit - - status = w.FinalStatus(); } if (w.state == WriteThread::STATE_COMPLETED) { if (log_used != nullptr) { @@ -207,13 +205,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, *seq_used = w.sequence; } // write is complete and leader has updated sequence - // Should we handle it? - status.PermitUncheckedError(); return w.FinalStatus(); } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); - + Status status; // Once reaches this point, the current writer "w" will try to do its write // job. It may also pick up some of the remaining writers in the "writers_" // when it finds suitable, and finish them in the same write batch. @@ -429,7 +425,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (need_log_sync) { mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, status); + if (status.ok()) { + status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + } else { + MarkLogsNotSynced(logfile_number_); + } mutex_.Unlock(); // Requesting sync with two_write_queues_ is expected to be very rare. We // hence provide a simple implementation that is not necessarily efficient. 
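The need_log_sync / MarkLogsSynced path above is driven by WriteOptions::sync. A minimal sketch of a write that requests a WAL sync (illustrative only):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/sync_write_demo", &db);
      assert(s.ok());

      rocksdb::WriteOptions wo;
      // Requires the WAL to be synced before the write is acknowledged; with
      // the change above, a failed sync now marks the logs as not synced.
      wo.sync = true;
      s = db->Put(wo, "key", "value");
      assert(s.ok());
      delete db;
      return 0;
    }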
@@ -469,7 +469,8 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, uint64_t* log_used, uint64_t log_ref, bool disable_memtable, uint64_t* seq_used) { PERF_TIMER_GUARD(write_pre_and_post_process_time); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.stats, + DB_WRITE); WriteContext write_context; @@ -530,6 +531,8 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, PERF_TIMER_STOP(write_pre_and_post_process_time); IOStatus io_s; + io_s.PermitUncheckedError(); // Allow io_s to be uninitialized + if (w.status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1); @@ -554,14 +557,23 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, if (need_log_sync) { mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status); + if (w.status.ok()) { + w.status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + } else { + MarkLogsNotSynced(logfile_number_); + } mutex_.Unlock(); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } + // NOTE: the memtable_write_group is declared before the following + // `if` statement because its lifetime needs to be longer + // that the inner context of the `if` as a reference to it + // may be used further below within the outer _write_thread WriteThread::WriteGroup memtable_write_group; + if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) { PERF_TIMER_GUARD(write_memtable_time); assert(w.ShouldWriteToMemtable()); @@ -578,6 +590,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } + } else { + // NOTE: the memtable_write_group is never really used, + // so we need to set its status to pass ASSERT_STATUS_CHECKED + memtable_write_group.status.PermitUncheckedError(); } if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { @@ -610,7 +626,8 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, SequenceNumber seq, const size_t sub_batch_cnt) { PERF_TIMER_GUARD(write_pre_and_post_process_time); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.stats, + DB_WRITE); WriteThread::Writer w(write_options, my_batch, callback, log_ref, false /*disable_memtable*/); @@ -661,12 +678,12 @@ Status DBImpl::WriteImplWALOnly( const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, const PublishLastSeq publish_last_seq, const bool disable_memtable) { - Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, sub_batch_cnt, pre_release_callback); RecordTick(stats_, WRITE_WITH_WAL); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.stats, + DB_WRITE); write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); @@ -683,6 +700,8 @@ Status DBImpl::WriteImplWALOnly( assert(w.state == WriteThread::STATE_GROUP_LEADER); if (publish_last_seq == kDoPublishLastSeq) { + Status status; + // Currently we only use kDoPublishLastSeq in 
unordered_write assert(immutable_db_options_.unordered_write); WriteContext write_context; @@ -759,7 +778,9 @@ Status DBImpl::WriteImplWALOnly( } seq_inc = total_batch_cnt; } + Status status; IOStatus io_s; + io_s.PermitUncheckedError(); // Allow io_s to be uninitialized if (!write_options.disableWAL) { io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); status = io_s; @@ -845,8 +866,7 @@ void DBImpl::WriteStatusCheckOnLocked(const Status& status) { if (immutable_db_options_.paranoid_checks && !status.ok() && !status.IsBusy() && !status.IsIncomplete()) { // Maybe change the return status to void? - error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); } } @@ -858,8 +878,7 @@ void DBImpl::WriteStatusCheck(const Status& status) { !status.IsBusy() && !status.IsIncomplete()) { mutex_.Lock(); // Maybe change the return status to void? - error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } @@ -872,8 +891,7 @@ void DBImpl::IOStatusCheck(const IOStatus& io_status) { io_status.IsIOFenced()) { mutex_.Lock(); // Maybe change the return status to void? - error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } @@ -922,7 +940,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. WaitForPendingWrites(); - status = HandleWriteBufferFull(write_context); + status = HandleWriteBufferManagerFlush(write_context); } if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { @@ -949,6 +967,20 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, PERF_TIMER_START(write_pre_and_post_process_time); } + // If memory usage exceeded beyond a certain threshold, + // write_buffer_manager_->ShouldStall() returns true to all threads writing to + // all DBs and writers will be stalled. + // It does soft checking because WriteBufferManager::buffer_limit_ has already + // exceeded at this point so no new write (including current one) will go + // through until memory usage is decreased. + if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) { + if (write_options.no_slowdown) { + status = Status::Incomplete("Write stall"); + } else { + WriteBufferManagerStallWrites(); + } + } + if (status.ok() && *need_log_sync) { // Wait until the parallel syncs are finished. 
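The ShouldStall() check above only takes effect when a WriteBufferManager is installed with stalling enabled. A short sketch of that setup; the allow_stall constructor argument is an assumption here, not something this patch introduces:

    #include <cassert>
    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/write_buffer_manager.h"

    int main() {
      // Shared across DBs; when total memtable memory exceeds the limit and
      // stalling is enabled, writers block (see WriteBufferManagerStallWrites
      // below) until memory usage drops. Assumes an allow_stall-style flag.
      auto wbm = std::make_shared<rocksdb::WriteBufferManager>(
          64 << 20 /* buffer_size */, nullptr /* cache */,
          true /* allow_stall */);

      rocksdb::Options options;
      options.create_if_missing = true;
      options.write_buffer_manager = wbm;

      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/wbm_stall_demo", &db);
      assert(s.ok());
      delete db;
      return 0;
    }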
Any sync process has to sync // the front log too so it is enough to check the status of front() @@ -1082,7 +1114,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, } if (io_s.ok() && need_log_sync) { - StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); + StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS); // It's safe to access logs_ with unlocked mutex_ here because: // - we've set getting_synced=true for all logs, // so other threads won't pop from logs_ while we're here, @@ -1330,16 +1362,23 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { } for (auto cfd : cfds) { cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWalFull); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWalFull); } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); MaybeScheduleFlushOrCompaction(); } return status; } -Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { +Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) { mutex_.AssertHeld(); assert(write_context != nullptr); Status status; @@ -1351,7 +1390,7 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { // suboptimal but still correct. ROCKS_LOG_INFO( immutable_db_options_.info_log, - "Flushing column family with oldest memtable entry. Write buffer is " + "Flushing column family with oldest memtable entry. Write buffers are " "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".", write_buffer_manager_->memory_usage(), write_buffer_manager_->buffer_size()); @@ -1409,10 +1448,17 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { } for (const auto cfd : cfds) { cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); MaybeScheduleFlushOrCompaction(); } return status; @@ -1432,8 +1478,10 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, uint64_t time_delayed = 0; bool delayed = false; { - StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); - uint64_t delay = write_controller_.GetDelay(env_, num_bytes); + StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, + &time_delayed); + uint64_t delay = + write_controller_.GetDelay(immutable_db_options_.clock, num_bytes); if (delay > 0) { if (write_options.no_slowdown) { return Status::Incomplete("Write stall"); @@ -1445,19 +1493,21 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, write_thread_.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); mutex_.Unlock(); - // We will delay the write until we have slept for delay ms or - // we don't need a delay anymore - const uint64_t kDelayInterval = 1000; + // We will delay the write until we have slept for `delay` microseconds + // or we don't need a delay anymore. 
We check for cancellation every 1ms + // (slightly longer because WriteController minimum delay is 1ms, in + // case of sleep imprecision, rounding, etc.) + const uint64_t kDelayInterval = 1001; uint64_t stall_end = sw.start_time() + delay; while (write_controller_.NeedsDelay()) { - if (env_->NowMicros() >= stall_end) { + if (immutable_db_options_.clock->NowMicros() >= stall_end) { // We already delayed this write `delay` microseconds break; } delayed = true; // Sleep for 0.001 seconds - env_->SleepForMicroseconds(kDelayInterval); + immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval); } mutex_.Lock(); write_thread_.EndWriteStall(); @@ -1503,6 +1553,29 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, return s; } +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +void DBImpl::WriteBufferManagerStallWrites() { + mutex_.AssertHeld(); + // First block future writer threads who want to add themselves to the queue + // of WriteThread. + write_thread_.BeginWriteStall(); + mutex_.Unlock(); + + // Change the state to State::Blocked. + static_cast(wbm_stall_.get()) + ->SetState(WBMStallInterface::State::BLOCKED); + // Then WriteBufferManager will add DB instance to its queue + // and block this thread by calling WBMStallInterface::Block(). + write_buffer_manager_->BeginWriteStall(wbm_stall_.get()); + wbm_stall_->Block(); + + mutex_.Lock(); + // Stall has ended. Signal writer threads so that they can add + // themselves to the WriteThread queue for writes. + write_thread_.EndWriteStall(); +} + Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, WriteBatch* my_batch) { assert(write_options.low_pri); @@ -1636,10 +1709,16 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + } else { + for (auto* cfd : cfds) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + } } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); MaybeScheduleFlushOrCompaction(); } return status; @@ -1785,30 +1864,74 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { } // We may have lost data from the WritableFileBuffer in-memory buffer for // the current log, so treat it as a fatal error and set bg_error - // Should handle return error? if (!io_s.ok()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable); } else { - error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable) - .PermitUncheckedError(); + error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); } // Read back bg_error in order to get the right severity s = error_handler_.GetBGError(); return s; } - for (auto loop_cfd : *versions_->GetColumnFamilySet()) { - // all this is just optimization to delete logs that - // are no longer needed -- if CF is empty, that means it - // doesn't need that particular log to stay alive, so we just - // advance the log number. 
no need to persist this in the manifest - if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && - loop_cfd->imm()->NumNotFlushed() == 0) { - if (creating_new_log) { - loop_cfd->SetLogNumber(logfile_number_); + bool empty_cf_updated = false; + if (immutable_db_options_.track_and_verify_wals_in_manifest && + !immutable_db_options_.allow_2pc && creating_new_log) { + // In non-2pc mode, WALs become obsolete if they do not contain unflushed + // data. Updating the empty CF's log number might cause some WALs to become + // obsolete. So we should track the WAL obsoletion event before actually + // updating the empty CF's log number. + uint64_t min_wal_number_to_keep = + versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_); + if (min_wal_number_to_keep > + versions_->GetWalSet().GetMinWalNumberToKeep()) { + // Get a snapshot of the empty column families. + // LogAndApply may release and reacquire db + // mutex, during that period, column family may become empty (e.g. its + // flush succeeds), then it affects the computed min_log_number_to_keep, + // so we take a snapshot for consistency of column family data + // status. If a column family becomes non-empty afterwards, its active log + // should still be the created new log, so the min_log_number_to_keep is + // not affected. + autovector empty_cfs; + for (auto cf : *versions_->GetColumnFamilySet()) { + if (cf->IsEmpty()) { + empty_cfs.push_back(cf); + } + } + + VersionEdit wal_deletion; + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); + s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + if (!s.ok()) { + return s; + } + + for (auto cf : empty_cfs) { + if (cf->IsEmpty()) { + cf->SetLogNumber(logfile_number_); + cf->mem()->SetCreationSeq(versions_->LastSequence()); + } // cf may become non-empty. + } + empty_cf_updated = true; + } + } + if (!empty_cf_updated) { + for (auto cf : *versions_->GetColumnFamilySet()) { + // all this is just optimization to delete logs that + // are no longer needed -- if CF is empty, that means it + // doesn't need that particular log to stay alive, so we just + // advance the log number. 
no need to persist this in the manifest + if (cf->IsEmpty()) { + if (creating_new_log) { + cf->SetLogNumber(logfile_number_); + } + cf->mem()->SetCreationSeq(versions_->LastSequence()); } - loop_cfd->mem()->SetCreationSeq(versions_->LastSequence()); } } diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index 207e85faa2b..55c8bb95aa4 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -62,11 +62,12 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, dbname.c_str(), file.c_str()); } break; - case kLogFile: + case kWalFile: if (env->GetFileSize(dbname + "/" + file, &file_size).ok()) { - char str[16]; - snprintf(str, sizeof(str), "%" PRIu64, file_size); - wal_info.append(file).append(" size: ").append(str).append(" ; "); + wal_info.append(file) + .append(" size: ") + .append(std::to_string(file_size)) + .append(" ; "); } else { Error(options.info_log, "Error when reading LOG file: %s/%s\n", dbname.c_str(), file.c_str()); @@ -118,11 +119,12 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, wal_info.clear(); for (const std::string& file : files) { if (ParseFileName(file, &number, &type)) { - if (type == kLogFile) { + if (type == kWalFile) { if (env->GetFileSize(options.wal_dir + "/" + file, &file_size).ok()) { - char str[16]; - snprintf(str, sizeof(str), "%" PRIu64, file_size); - wal_info.append(file).append(" size: ").append(str).append(" ; "); + wal_info.append(file) + .append(" size: ") + .append(std::to_string(file_size)) + .append(" ; "); } else { Error(options.info_log, "Error when reading LOG file %s/%s\n", options.wal_dir.c_str(), file.c_str()); diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc index 1fcaa6904ee..232ae649ccb 100644 --- a/db/db_io_failure_test.cc +++ b/db/db_io_failure_test.cc @@ -43,11 +43,15 @@ TEST_F(DBIOFailureTest, DropWrites) { if (level > 0 && level == dbfull()->NumberLevels() - 1) { break; } - dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, - true /* disallow trivial move */); + Status s = + dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.ok() || s.IsCorruption()); } } else { - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + Status s = + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsCorruption()); } } @@ -56,7 +60,8 @@ TEST_F(DBIOFailureTest, DropWrites) { ASSERT_EQ("5", property_value); env_->drop_writes_.store(false, std::memory_order_release); - ASSERT_LT(CountFiles(), num_files + 3); + const size_t count = CountFiles(); + ASSERT_LT(count, num_files + 3); // Check that compaction attempts slept after errors // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler @@ -82,7 +87,8 @@ TEST_F(DBIOFailureTest, DropWritesFlush) { ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("0", property_value); - dbfull()->TEST_FlushMemTable(true); + // ASSERT file is too short + ASSERT_TRUE(dbfull()->TEST_FlushMemTable(true).IsCorruption()); ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("1", property_value); @@ -166,7 +172,7 @@ TEST_F(DBIOFailureTest, ManifestWriteError) { ASSERT_EQ("bar", Get("foo")); // Memtable compaction (will succeed) - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("foo")); const int last = 2; MoveFilesToLevel(2); @@ -174,7 +180,8 @@ TEST_F(DBIOFailureTest, ManifestWriteError) { // Merging compaction (will fail) error_type->store(true, 
std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_NOK( + dbfull()->TEST_CompactRange(last, nullptr, nullptr)); // Should fail ASSERT_EQ("bar", Get("foo")); error_type->store(false, std::memory_order_release); @@ -192,7 +199,13 @@ TEST_F(DBIOFailureTest, ManifestWriteError) { // Merging compaction (will fail) error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + Status s = + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + if (iter == 0) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsIOError()); + } ASSERT_EQ("bar", Get("foo")); // Recovery: should not lose data @@ -220,18 +233,15 @@ TEST_F(DBIOFailureTest, PutFailsParanoid) { options.paranoid_checks = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo2", "bar2")); env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); // the next put should fail, too - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo3", "bar3")); // but we're still able to read ASSERT_EQ("bar", Get(1, "foo")); @@ -244,12 +254,10 @@ TEST_F(DBIOFailureTest, PutFailsParanoid) { ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo2", "bar2")); env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); // the next put should NOT fail - ASSERT_TRUE(s.ok()); + ASSERT_OK(Put(1, "foo3", "bar3")); } #if !(defined NDEBUG) || !defined(OS_WIN) TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { @@ -269,14 +277,14 @@ TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + const char* io_error_msg = "range sync dummy error"; std::atomic range_sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::RangeSync", [&](void* arg) { if (range_sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("range sync dummy error"); + *st = Status::IOError(io_error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -298,7 +306,9 @@ TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { ASSERT_OK(Put(1, "foo3_2", rnd_str)); ASSERT_OK(Put(1, "foo3_3", rnd_str)); ASSERT_OK(Put(1, "foo4", "bar")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
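The flush and compaction failure tests here all follow the same SyncPoint fault-injection pattern: a callback registered on a SpecialEnv callsite overwrites the Status* argument exactly once, and the injected error is then expected to surface from the background job rather than from the foreground write. A minimal sketch of that pattern (callsite name taken from the tests, error text illustrative):

  const char* io_error_msg = "injected dummy error";  // illustrative message
  std::atomic<int> injected(0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
        if (injected.fetch_add(1) == 0) {
          // The callsite hands us a Status*; overwriting it simulates an IO error.
          *static_cast<Status*>(arg) = Status::IOError(io_error_msg);
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // The failure comes back from the background flush, not from the Put calls:
  Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
  ASSERT_TRUE(s.IsIOError());
  ASSERT_STREQ(s.getState(), io_error_msg);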
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -328,7 +338,6 @@ TEST_F(DBIOFailureTest, CompactSstRangeSyncError) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; Random rnd(301); std::string rnd_str = @@ -342,21 +351,22 @@ TEST_F(DBIOFailureTest, CompactSstRangeSyncError) { ASSERT_OK(Put(1, "foo1_1", rnd_str)); ASSERT_OK(Put(1, "foo1_2", rnd_str)); ASSERT_OK(Put(1, "foo1_3", rnd_str)); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo3_1", rnd_str)); ASSERT_OK(Put(1, "foo3_2", rnd_str)); ASSERT_OK(Put(1, "foo3_3", rnd_str)); ASSERT_OK(Put(1, "foo4", "bar")); - Flush(1); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + const char* io_error_msg = "range sync dummy error"; std::atomic range_sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::RangeSync", [&](void* arg) { if (range_sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("range sync dummy error"); + *st = Status::IOError(io_error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -365,7 +375,9 @@ TEST_F(DBIOFailureTest, CompactSstRangeSyncError) { { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -389,13 +401,14 @@ TEST_F(DBIOFailureTest, FlushSstCloseError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + + const char* io_error_msg = "close dummy error"; std::atomic close_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Close", [&](void* arg) { if (close_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -404,7 +417,9 @@ TEST_F(DBIOFailureTest, FlushSstCloseError) { ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); ASSERT_OK(Put(1, "foo", "bar2")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -429,25 +444,25 @@ TEST_F(DBIOFailureTest, CompactionSstCloseError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar2")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar3")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + const char* io_error_msg = "close dummy error"; std::atomic close_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Close", [&](void* arg) { if (close_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -456,7 +471,9 @@ TEST_F(DBIOFailureTest, CompactionSstCloseError) { { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as compaction failed. ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -480,13 +497,14 @@ TEST_F(DBIOFailureTest, FlushSstSyncError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + + const char* io_error_msg = "sync dummy error"; std::atomic sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Sync", [&](void* arg) { if (sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("sync dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -495,7 +513,9 @@ TEST_F(DBIOFailureTest, FlushSstSyncError) { ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); ASSERT_OK(Put(1, "foo", "bar2")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -521,25 +541,25 @@ TEST_F(DBIOFailureTest, CompactionSstSyncError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar2")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar3")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + const char* io_error_msg = "sync dummy error"; std::atomic sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Sync", [&](void* arg) { if (sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -548,7 +568,9 @@ TEST_F(DBIOFailureTest, CompactionSstSyncError) { { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as compaction failed. 
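After an injected flush or compaction failure the DB stays in a background-error state, which is why every subsequent Put in these tests is expected to fail until the error is cleared. The tests observe this both through ASSERT_NOK on the writes and through the background-error counter property; a minimal sketch of that check, assuming exactly one failure has been injected:

  std::string property_value;
  // Counts background errors recorded so far; "1" assumes a single injected
  // failure, matching the flush-failure test above.
  ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
  ASSERT_EQ("1", property_value);

  // While the background error is set, foreground writes are rejected.
  ASSERT_NOK(Put(1, "foo2", "bar3"));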
ASSERT_NOK(Put(1, "foo2", "bar3")); diff --git a/db/db_iter.cc b/db/db_iter.cc index a9eee88dd92..75a196e4dd2 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -8,9 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_iter.h" -#include + #include #include +#include #include "db/dbformat.h" #include "db/merge_context.h" @@ -24,6 +25,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/system_clock.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "trace_replay/trace_replay.h" @@ -34,21 +36,24 @@ namespace ROCKSDB_NAMESPACE { DBIter::DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, - const Comparator* cmp, InternalIterator* iter, SequenceNumber s, - bool arena_mode, uint64_t max_sequential_skip_in_iterations, + const Comparator* cmp, InternalIterator* iter, + const Version* version, SequenceNumber s, bool arena_mode, + uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) + ColumnFamilyData* cfd, bool expose_blob_index) : prefix_extractor_(mutable_cf_options.prefix_extractor.get()), env_(_env), - logger_(cf_options.info_log), + clock_(ioptions.clock), + logger_(ioptions.logger), user_comparator_(cmp), - merge_operator_(cf_options.merge_operator), + merge_operator_(ioptions.merge_operator.get()), iter_(iter), + version_(version), read_callback_(read_callback), sequence_(s), - statistics_(cf_options.statistics), + statistics_(ioptions.stats), max_skip_(max_sequential_skip_in_iterations), max_skippable_internal_keys_(read_options.max_skippable_internal_keys), num_internal_keys_skipped_(0), @@ -65,10 +70,12 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, expect_total_order_inner_iter_(prefix_extractor_ == nullptr || read_options.total_order_seek || read_options.auto_prefix_mode), - allow_blob_(allow_blob), + read_tier_(read_options.read_tier), + verify_checksums_(read_options.verify_checksums), + expose_blob_index_(expose_blob_index), is_blob_(false), arena_mode_(arena_mode), - range_del_agg_(&cf_options.internal_comparator, s), + range_del_agg_(&ioptions.internal_comparator, s), db_impl_(db_impl), cfd_(cfd), start_seqnum_(read_options.iter_start_seqnum), @@ -107,11 +114,11 @@ Status DBIter::GetProperty(std::string prop_name, std::string* prop) { } bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (ParseInternalKey(iter_.key(), ikey) != Status::OK()) { - status_ = Status::Corruption("corrupted internal key in DBIter"); + Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */); + if (!s.ok()) { + status_ = Status::Corruption("In DBIter: ", s.getState()); valid_ = false; - ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s", - iter_.key().ToString(true).c_str()); + ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState()); return false; } else { return true; @@ -122,7 +129,7 @@ void DBIter::Next() { assert(valid_); assert(status_.ok()); - PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); // Release temporarily pinned blocks from last operation ReleaseTempPinnedData(); local_stats_.skip_count_ += num_internal_keys_skipped_; @@ -164,6 +171,42 @@ void DBIter::Next() { } } +bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, + const Slice& blob_index) 
{ + assert(!is_blob_); + + if (expose_blob_index_) { // Stacked BlobDB implementation + is_blob_ = true; + return true; + } + + if (!version_) { + status_ = Status::Corruption("Encountered unexpected blob index."); + valid_ = false; + return false; + } + + // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to + // avoid having to copy options back and forth. + ReadOptions read_options; + read_options.read_tier = read_tier_; + read_options.verify_checksums = verify_checksums_; + + constexpr uint64_t* bytes_read = nullptr; + + const Status s = version_->GetBlob(read_options, user_key, blob_index, + &blob_value_, bytes_read); + + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + is_blob_ = true; + return true; +} + // PRE: saved_key_ has the current user key if skipping_saved_key // POST: saved_key_ should have the next user key if valid_, // if the current entry is a result of merge @@ -220,25 +263,28 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, is_key_seqnum_zero_ = false; return false; } + Slice user_key_without_ts = + StripTimestampFromUserKey(ikey_.user_key, timestamp_size_); is_key_seqnum_zero_ = (ikey_.sequence == 0); assert(iterate_upper_bound_ == nullptr || iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound || user_comparator_.CompareWithoutTimestamp( - ikey_.user_key, /*a_has_ts=*/true, *iterate_upper_bound_, + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, /*b_has_ts=*/false) < 0); if (iterate_upper_bound_ != nullptr && iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound && user_comparator_.CompareWithoutTimestamp( - ikey_.user_key, /*a_has_ts=*/true, *iterate_upper_bound_, + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, /*b_has_ts=*/false) >= 0) { break; } assert(prefix == nullptr || prefix_extractor_ != nullptr); if (prefix != nullptr && - prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) { + prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) != + 0) { assert(prefix_same_as_start_); break; } @@ -315,8 +361,14 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, case kTypeBlobIndex: if (start_seqnum_ > 0) { if (ikey_.sequence >= start_seqnum_) { - assert(ikey_.type != kTypeBlobIndex); saved_key_.SetInternalKey(ikey_); + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + } + valid_ = true; return true; } else { @@ -330,6 +382,13 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, } } else if (timestamp_lb_) { saved_key_.SetInternalKey(ikey_); + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + } + valid_ = true; return true; } else { @@ -344,20 +403,13 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, num_skipped = 0; reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else if (ikey_.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; - return false; + } else { + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } } - is_blob_ = true; - valid_ = true; - return true; - } else { valid_ = true; return true; } @@ -436,11 +488,11 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, &last_key, ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion)); } else { - std::string min_ts(timestamp_size_, static_cast(0)); + const std::string kTsMin(timestamp_size_, '\0'); AppendInternalKeyWithDifferentTimestamp( &last_key, ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion), - min_ts); + kTsMin); } // Don't set skipping_saved_key = false because we may still see more // user-keys equal to saved_key_. @@ -497,7 +549,6 @@ bool DBIter::MergeValuesNewToOld() { TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); ParsedInternalKey ikey; - Status s; for (iter_.Next(); iter_.Valid(); iter_.Next()) { TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand"); if (!ParseKey(&ikey)) { @@ -525,12 +576,8 @@ bool DBIter::MergeValuesNewToOld() { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! const Slice val = iter_.value(); - s = MergeHelper::TimedFullMerge( - merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, &pinned_value_, true); + Status s = Merge(&val, ikey.user_key); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } // iter_ is positioned after put @@ -547,17 +594,31 @@ bool DBIter::MergeValuesNewToOld() { iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (kTypeBlobIndex == ikey.type) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; } - valid_ = false; - return false; + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + return false; + } + valid_ = true; + const Slice blob_value = value(); + Status s = Merge(&blob_value, ikey.user_key); + if (!s.ok()) { + return false; + } + is_blob_ = false; + // iter_ is positioned after put + iter_.Next(); + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + return true; } else { valid_ = false; status_ = Status::Corruption( @@ -576,32 +637,19 @@ bool DBIter::MergeValuesNewToOld() { // a deletion marker. // feed null as the existing value to the merge operator, such that // client can differentiate this scenario and do things accordingly. 
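With the integrated BlobDB path added here, a kTypeBlobIndex entry is resolved through the current Version, so Iterator::value() already returns the user value rather than the blob index; only the stacked BlobDB (expose_blob_index == true) still surfaces the raw index. A minimal usage-level sketch under the assumption that the DB was opened with blob files enabled (the option name is illustrative of the integrated path, not something this diff touches):

  // Assumes a DB opened with blob storage enabled for the column family
  // (e.g. enable_blob_files = true); keys and values are illustrative.
  std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // value() is already the user value; any blob index was resolved
    // internally via Version::GetBlob().
    Slice user_value = it->value();
    (void)user_value;
  }
  assert(it->status().ok());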
- s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(), - nullptr, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); + Status s = Merge(nullptr, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } - assert(status_.ok()); return true; } void DBIter::Prev() { - if (timestamp_size_ > 0) { - valid_ = false; - status_ = Status::NotSupported( - "SeekToLast/SeekForPrev/Prev currently not supported with timestamp."); - return; - } - assert(valid_); assert(status_.ok()); - PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); bool ok = true; @@ -636,9 +684,16 @@ bool DBIter::ReverseToForward() { // If that's the case, seek iter_ to current key. if (!expect_total_order_inner_iter() || !iter_.Valid()) { IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); iter_.Seek(last_key.GetInternalKey()); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } direction_ = kForward; @@ -689,6 +744,7 @@ bool DBIter::ReverseToBackward() { iter_.SeekToLast(); } } + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } direction_ = kReverse; @@ -703,7 +759,9 @@ void DBIter::PrevInternal(const Slice* prefix) { assert(prefix == nullptr || prefix_extractor_ != nullptr); if (prefix != nullptr && - prefix_extractor_->Transform(saved_key_.GetUserKey()) + prefix_extractor_ + ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(), + timestamp_size_)) .compare(*prefix) != 0) { assert(prefix_same_as_start_); // Current key does not have the same prefix as start @@ -712,11 +770,13 @@ void DBIter::PrevInternal(const Slice* prefix) { } assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) >= 0); + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, + *iterate_lower_bound_, /*b_has_ts=*/false) >= 0); if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) < 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { // We've iterated earlier than the user-specified lower bound. 
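The reseek paths above now build their internal seek keys with explicit timestamp sentinels: within one user key, entries sort newest-first, so the all-0xff timestamp combined with kMaxSequenceNumber positions the inner iterator at the newest entry, while the all-zero timestamp with sequence 0 marks the oldest possible position. A minimal sketch of both constructions, assuming a fixed timestamp_size_ and the default newest-first ordering (user_key is illustrative):

  const std::string kTsMax(timestamp_size_, '\xff');  // newest possible timestamp
  const std::string kTsMin(timestamp_size_, '\0');    // oldest possible timestamp

  // Seek target for the newest visible entry of user_key.
  ParsedInternalKey newest(user_key, kMaxSequenceNumber, kValueTypeForSeek);
  if (timestamp_size_ > 0) {
    newest.SetTimestamp(kTsMax);
  }

  // Sentinel for the oldest position of user_key, as used when synthesizing a
  // deletion marker during reseeks.
  std::string oldest_key;
  AppendInternalKeyWithDifferentTimestamp(
      &oldest_key,
      ParsedInternalKey(user_key, 0 /* sequence */, kTypeDeletion), kTsMin);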
valid_ = false; return; @@ -761,8 +821,8 @@ bool DBIter::FindValueForCurrentKey() { assert(iter_.Valid()); merge_context_.Clear(); current_entry_is_merged_ = false; - // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or - // kTypeValue) + // last entry before merge (could be kTypeDeletion, + // kTypeDeletionWithTimestamp, kTypeSingleDeletion or kTypeValue) ValueType last_not_merge_type = kTypeDeletion; ValueType last_key_entry_type = kTypeDeletion; @@ -783,9 +843,13 @@ bool DBIter::FindValueForCurrentKey() { timestamp_size_); } if (!IsVisible(ikey.sequence, ts) || - !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + !user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { break; } + if (!ts.empty()) { + saved_timestamp_.assign(ts.data(), ts.size()); + } if (TooManyInternalKeysSkipped()) { return false; } @@ -810,14 +874,22 @@ bool DBIter::FindValueForCurrentKey() { ikey, RangeDelPositioningMode::kBackwardTraversal)) { last_key_entry_type = kTypeRangeDeletion; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else { - assert(iter_.iter()->IsValuePinned()); + } else if (iter_.iter()->IsValuePinned()) { pinned_value_ = iter_.value(); + } else { + valid_ = false; + status_ = Status::NotSupported( + "Backward iteration not supported if underlying iterator's value " + "cannot be pinned."); } merge_context_.Clear(); last_not_merge_type = last_key_entry_type; + if (!status_.ok()) { + return false; + } break; case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: merge_context_.Clear(); last_not_merge_type = last_key_entry_type; @@ -861,6 +933,7 @@ bool DBIter::FindValueForCurrentKey() { is_blob_ = false; switch (last_key_entry_type) { case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: case kTypeRangeDeletion: valid_ = false; @@ -870,43 +943,45 @@ bool DBIter::FindValueForCurrentKey() { if (last_not_merge_type == kTypeDeletion || last_not_merge_type == kTypeSingleDeletion || last_not_merge_type == kTypeRangeDeletion) { - s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), nullptr, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + s = Merge(nullptr, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + return true; } else if (last_not_merge_type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; } - valid_ = false; - return false; + if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { + return false; + } + valid_ = true; + const Slice blob_value = value(); + s = Merge(&blob_value, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + is_blob_ = false; + return true; } else { assert(last_not_merge_type == kTypeValue); - s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), &pinned_value_, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + s = Merge(&pinned_value_, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + return true; } break; case kTypeValue: // do nothing - we've already has value in pinned_value_ break; case kTypeBlobIndex: - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; + if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { return false; } - is_blob_ = true; break; default: valid_ = false; @@ -933,8 +1008,17 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { // FindValueForCurrentKeyUsingSeek() assert(pinned_iters_mgr_.PinningEnabled()); std::string last_key; - AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), - sequence_, kValueTypeForSeek)); + if (0 == timestamp_size_) { + AppendInternalKey(&last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } else { + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek), + *timestamp_ub_); + } iter_.Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); @@ -958,7 +1042,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { timestamp_size_); } - if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { // No visible values for this key, even though FindValueForCurrentKey() // has seen some. This is possible if we're using a tailing iterator, and // the entries were discarded in a compaction. @@ -975,26 +1060,28 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || range_del_agg_.ShouldDelete( - ikey, RangeDelPositioningMode::kBackwardTraversal)) { + ikey, RangeDelPositioningMode::kBackwardTraversal) || + kTypeDeletionWithTimestamp == ikey.type) { valid_ = false; return true; } - if (ikey.type == kTypeBlobIndex && !allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; - return false; - } if (!iter_.PrepareValue()) { valid_ = false; return false; } + if (timestamp_size_ > 0) { + Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_); + saved_timestamp_.assign(ts.data(), ts.size()); + } if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { assert(iter_.iter()->IsValuePinned()); pinned_value_ = iter_.value(); - is_blob_ = (ikey.type == kTypeBlobIndex); + if (ikey.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) { + return false; + } + } + valid_ = true; return true; } @@ -1034,33 +1121,33 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (ikey.type == kTypeValue) { const Slice val = iter_.value(); - Status s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), &val, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + Status s = Merge(&val, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } - valid_ = true; return true; } else if (ikey.type == kTypeMerge) { merge_context_.PushOperand( iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (ikey.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; } - valid_ = false; - return false; + if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + return false; + } + valid_ = true; + const Slice blob_value = value(); + Status s = Merge(&blob_value, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + is_blob_ = false; + return true; } else { valid_ = false; status_ = Status::Corruption( @@ -1070,13 +1157,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { } } - Status s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), nullptr, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); + Status s = Merge(nullptr, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } @@ -1099,6 +1181,19 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } +Status DBIter::Merge(const Slice* val, const Slice& user_key) { + Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key, val, merge_context_.GetOperands(), + &saved_value_, logger_, statistics_, clock_, &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return s; + } + valid_ = true; + return s; +} + // Move backwards until the key smaller than saved_key_. // Changes valid_ only if return value is false. 
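The new DBIter::Merge() helper centralizes the MergeHelper::TimedFullMerge call together with the status_ and valid_ bookkeeping that each call site previously repeated, so the merge paths above reduce to the pattern below (a sketch of the call-site shape, not standalone code; val may be nullptr when there is no base value):

  // Typical call-site pattern after the refactor.
  Status s = Merge(&val, saved_key_.GetUserKey());
  if (!s.ok()) {
    // Merge() has already set status_ and cleared valid_.
    return false;
  }
  // On success Merge() sets valid_ = true; the result lives in saved_value_,
  // or in pinned_value_ when the merge result is one of the operands.
  return true;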
bool DBIter::FindUserKeyBeforeSavedKey() { @@ -1110,7 +1205,8 @@ bool DBIter::FindUserKeyBeforeSavedKey() { return false; } - if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) { + if (user_comparator_.CompareWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey()) < 0) { return true; } @@ -1134,8 +1230,14 @@ bool DBIter::FindUserKeyBeforeSavedKey() { if (num_skipped >= max_skip_) { num_skipped = 0; IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); // It would be more efficient to use SeekForPrev() here, but some // iterators may not support it. iter_.Seek(last_key.GetInternalKey()); @@ -1212,24 +1314,50 @@ void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { saved_key_.Clear(); // now saved_key is used to store internal key. saved_key_.SetInternalKey(target, 0 /* sequence_number */, - kValueTypeForSeekForPrev); + kValueTypeForSeekForPrev, timestamp_ub_); + + if (timestamp_size_ > 0) { + const std::string kTsMin(timestamp_size_, '\0'); + Slice ts = kTsMin; + saved_key_.UpdateInternalKey(/*seq=*/0, kValueTypeForSeekForPrev, &ts); + } if (iterate_upper_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_upper_bound_) >= 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber, + kValueTypeForSeekForPrev, timestamp_ub_); + if (timestamp_size_ > 0) { + const std::string kTsMax(timestamp_size_, '\xff'); + Slice ts = kTsMax; + saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev, + &ts); + } } } void DBIter::Seek(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { // TODO: What do we do if this returns an error? - db_impl_->TraceIteratorSeek(cfd_->GetID(), target).PermitUncheckedError(); + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound) + .PermitUncheckedError(); } #endif // ROCKSDB_LITE @@ -1257,7 +1385,7 @@ void DBIter::Seek(const Slice& target) { // we need to find out the next key that is visible to the user. ClearSavedValue(); if (prefix_same_as_start_) { - // The case where the iterator needs to be invalidated if it has exausted + // The case where the iterator needs to be invalidated if it has exhausted // keys within the same prefix of the seek key. 
assert(prefix_extractor_ != nullptr); Slice target_prefix = prefix_extractor_->Transform(target); @@ -1285,24 +1413,30 @@ void DBIter::Seek(const Slice& target) { } void DBIter::SeekForPrev(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { // TODO: What do we do if this returns an error? - db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target) + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_ + ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound, + upper_bound) .PermitUncheckedError(); } #endif // ROCKSDB_LITE - if (timestamp_size_ > 0) { - valid_ = false; - status_ = Status::NotSupported( - "SeekToLast/SeekForPrev/Prev currently not supported with timestamp."); - return; - } - status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); @@ -1326,7 +1460,7 @@ void DBIter::SeekForPrev(const Slice& target) { // backward direction. ClearSavedValue(); if (prefix_same_as_start_) { - // The case where the iterator needs to be invalidated if it has exausted + // The case where the iterator needs to be invalidated if it has exhausted // keys within the same prefix of the seek key. assert(prefix_extractor_ != nullptr); Slice target_prefix = prefix_extractor_->Transform(target); @@ -1353,7 +1487,7 @@ void DBIter::SeekToFirst() { Seek(*iterate_lower_bound_); return; } - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. if (!expect_total_order_inner_iter()) { @@ -1391,29 +1525,25 @@ void DBIter::SeekToFirst() { } if (valid_ && prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + prefix_.SetUserKey(prefix_extractor_->Transform( + StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } void DBIter::SeekToLast() { - if (timestamp_size_ > 0) { - valid_ = false; - status_ = Status::NotSupported( - "SeekToLast/SeekForPrev/Prev currently not supported with timestamp."); - return; - } - if (iterate_upper_bound_ != nullptr) { // Seek to last key strictly less than ReadOptions.iterate_upper_bound. SeekForPrev(*iterate_upper_bound_); - if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { + if (Valid() && 0 == user_comparator_.CompareWithoutTimestamp( + *iterate_upper_bound_, /*a_has_ts=*/false, key(), + /*b_has_ts=*/false)) { ReleaseTempPinnedData(); PrevInternal(nullptr); } return; } - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. 
if (!expect_total_order_inner_iter()) { @@ -1442,23 +1572,25 @@ void DBIter::SeekToLast() { } if (valid_ && prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + prefix_.SetUserKey(prefix_extractor_->Transform( + StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, - InternalIterator* internal_iter, + InternalIterator* internal_iter, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) { - DBIter* db_iter = new DBIter( - env, read_options, cf_options, mutable_cf_options, user_key_comparator, - internal_iter, sequence, false, max_sequential_skip_in_iterations, - read_callback, db_impl, cfd, allow_blob); + ColumnFamilyData* cfd, bool expose_blob_index) { + DBIter* db_iter = + new DBIter(env, read_options, ioptions, mutable_cf_options, + user_key_comparator, internal_iter, version, sequence, false, + max_sequential_skip_in_iterations, read_callback, db_impl, cfd, + expose_blob_index); return db_iter; } diff --git a/db/db_iter.h b/db/db_iter.h index 93b78c06ee8..52bffa55d6a 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -8,8 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include +#include #include + #include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" @@ -21,6 +22,7 @@ #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +class Version; // This file declares the factory functions of DBIter, in its original form // or a wrapped form with class ArenaWrappedDBIter, which is defined here. @@ -66,7 +68,7 @@ class DBIter final : public Iterator { // this->key(). // (2) When moving backwards, the internal iterator is positioned // just before all entries whose user key == this->key(). - enum Direction { kForward, kReverse }; + enum Direction : uint8_t { kForward, kReverse }; // LocalStatistics contain Statistics counters that will be aggregated per // each iterator instance and then will be sent to the global statistics when @@ -112,12 +114,12 @@ class DBIter final : public Iterator { }; DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* cmp, - InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, + InternalIterator* iter, const Version* version, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob); + bool expose_blob_index); // No copying allowed DBIter(const DBIter&) = delete; @@ -159,7 +161,10 @@ class DBIter final : public Iterator { } Slice value() const override { assert(valid_); - if (current_entry_is_merged_) { + + if (!expose_blob_index_ && is_blob_) { + return blob_value_; + } else if (current_entry_is_merged_) { // If pinned_value_ is set then the result of merge operator is one of // the merge operands and we should return it. return pinned_value_.data() ? 
pinned_value_ : saved_value_; @@ -180,12 +185,15 @@ class DBIter final : public Iterator { Slice timestamp() const override { assert(valid_); assert(timestamp_size_ > 0); + if (direction_ == kReverse) { + return saved_timestamp_; + } const Slice ukey_and_ts = saved_key_.GetUserKey(); assert(timestamp_size_ < ukey_and_ts.size()); return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_); } bool IsBlob() const { - assert(valid_ && (allow_blob_ || !is_blob_)); + assert(valid_); return is_blob_; } @@ -227,7 +235,7 @@ class DBIter final : public Iterator { // If `skipping_saved_key` is true, the function will keep iterating until it // finds a user key that is larger than `saved_key_`. // If `prefix` is not null, the iterator needs to stop when all keys for the - // prefix are exhausted and the interator is set to invalid. + // prefix are exhausted and the iterator is set to invalid. bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); // Internal implementation of FindNextUserEntry(). bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); @@ -287,12 +295,20 @@ class DBIter final : public Iterator { : user_comparator_.CompareWithoutTimestamp(a, b); } + // Retrieves the blob value for the specified user key using the given blob + // index when using the integrated BlobDB implementation. + bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index); + + Status Merge(const Slice* val, const Slice& user_key); + const SliceTransform* prefix_extractor_; Env* const env_; + SystemClock* clock_; Logger* logger_; UserComparatorWrapper user_comparator_; const MergeOperator* const merge_operator_; IteratorWrapper iter_; + const Version* version_; ReadCallback* read_callback_; // Max visible sequence number. It is normally the snapshot seq unless we have // uncommitted data in db as in WriteUnCommitted. @@ -306,6 +322,7 @@ class DBIter final : public Iterator { std::string saved_value_; Slice pinned_value_; // for prefix seek mode to support prev() + PinnableSlice blob_value_; Statistics* statistics_; uint64_t max_skip_; uint64_t max_skippable_internal_keys_; @@ -335,7 +352,11 @@ class DBIter final : public Iterator { // Expect the inner iterator to maintain a total order. // prefix_extractor_ must be non-NULL if the value is false. const bool expect_total_order_inner_iter_; - bool allow_blob_; + ReadTier read_tier_; + bool verify_checksums_; + // Whether the iterator is allowed to expose blob references. Set to true when + // the stacked BlobDB implementation is used, false otherwise. + bool expose_blob_index_; bool is_blob_; bool arena_mode_; // List of operands for merge operator. @@ -357,18 +378,19 @@ class DBIter final : public Iterator { const Slice* const timestamp_ub_; const Slice* const timestamp_lb_; const size_t timestamp_size_; + std::string saved_timestamp_; }; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. 
extern Iterator* NewDBIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, InternalIterator* internal_iter, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false); + const Version* version, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool expose_blob_index = false); } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index 484bb0b45a3..f2b200f68bf 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -511,9 +511,9 @@ TEST_F(DBIteratorStressTest, StressTest) { target_hidden_fraction; internal_iter->trace = trace; db_iter.reset(NewDBIterator( - env_, ropt, ImmutableCFOptions(options), + env_, ropt, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), - internal_iter, sequence, + internal_iter, nullptr /* version */, sequence, options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); } diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 1c9680da267..252b192fede 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -99,10 +99,11 @@ class TestIterator : public InternalIterator { } for (auto it = data_.begin(); it != data_.end(); ++it) { ParsedInternalKey ikey; - Status pikStatus = ParseInternalKey(it->first, &ikey); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); - if (!pikStatus.ok() || ikey.user_key != _key) { + Status pik_status = + ParseInternalKey(it->first, &ikey, true /* log_err_key */); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + if (!pik_status.ok() || ikey.user_key != _key) { continue; } if (valid_ && data_.begin() + iter_ > it) { @@ -236,7 +237,7 @@ class DBIteratorTest : public testing::Test { TEST_F(DBIteratorTest, DBIteratorPrevNext) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -251,9 +252,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -284,9 +286,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -311,9 
+314,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -344,9 +348,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -380,12 +385,14 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test case to check SeekToLast with iterate_upper_bound set // (same key put may times - SeekToLast should start with the @@ -410,9 +417,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -448,9 +456,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -474,12 +483,14 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test to check the 
SeekToLast() with the iterate_upper_bound set // (Deletion cases) @@ -497,9 +508,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -533,9 +545,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -563,9 +576,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -584,6 +598,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -606,9 +621,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -638,9 +654,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -660,7 +677,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { TEST_F(DBIteratorTest, DBIteratorEmpty) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; @@ -669,11 +686,13 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, 
BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -681,11 +700,13 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -704,9 +725,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -727,6 +749,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u); } @@ -734,7 +757,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -749,9 +772,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -770,6 +794,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -785,9 +810,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -801,6 +827,7 @@ 
TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -814,9 +841,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 202, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 202 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -835,6 +863,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -847,14 +876,17 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -864,9 +896,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 200, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 200 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -874,6 +907,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -882,6 +916,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Next(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -899,9 +934,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -920,6 +956,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -934,9 +971,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr 
/*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -959,13 +997,14 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } } TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; @@ -984,9 +1023,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 0; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1014,7 +1054,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); - ASSERT_TRUE(db_iter->status().ok()); + ASSERT_OK(db_iter->status()); } // Test to make sure that the request will *not* fail as incomplete if @@ -1031,9 +1071,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1076,9 +1117,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1115,9 +1157,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1151,9 +1194,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); 
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1182,9 +1226,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1220,9 +1265,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1258,9 +1304,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1312,9 +1359,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { options.max_sequential_skip_in_iterations = 1000; ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1351,9 +1399,10 @@ TEST_F(DBIteratorTest, DBIterator1) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 1 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1379,9 +1428,10 @@ TEST_F(DBIteratorTest, DBIterator2) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, 
ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 0 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1404,9 +1454,10 @@ TEST_F(DBIteratorTest, DBIterator3) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1429,9 +1480,10 @@ TEST_F(DBIteratorTest, DBIterator4) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 4 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1448,7 +1500,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1463,9 +1515,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1486,9 +1539,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1509,9 +1563,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + 
nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1532,9 +1587,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 3, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 3 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1555,9 +1611,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1578,9 +1635,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1601,9 +1659,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1622,9 +1681,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1638,7 +1698,7 @@ TEST_F(DBIteratorTest, DBIterator6) { ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1653,9 +1713,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1676,9 +1737,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1699,9 +1761,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1722,9 +1785,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 3, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 3 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -1741,9 +1805,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1764,9 +1829,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1787,9 +1853,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - 
internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1803,7 +1870,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1830,9 +1897,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1865,9 +1933,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1906,9 +1975,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1947,9 +2017,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1993,9 +2064,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2040,9 +2112,10 @@ TEST_F(DBIteratorTest, DBIterator7) { 
internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2081,9 +2154,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 9, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 9 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2128,9 +2202,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 13, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 13 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2176,9 +2251,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 14, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 14 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2207,9 +2283,10 @@ TEST_F(DBIteratorTest, DBIterator8) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2238,9 +2315,10 @@ TEST_F(DBIteratorTest, DBIterator9) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2305,9 +2383,10 @@ TEST_F(DBIteratorTest, DBIterator10) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 
10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -2345,9 +2424,9 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, 0 /* force seek */, - nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2374,9 +2453,10 @@ TEST_F(DBIteratorTest, DBIterator11) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 1 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2401,8 +2481,9 @@ TEST_F(DBIteratorTest, DBIterator12) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, 0, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -2438,8 +2519,10 @@ TEST_F(DBIteratorTest, DBIterator13) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, 3, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, 3 /* max_sequential_skip_in_iterations */, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), key); @@ -2466,8 +2549,10 @@ TEST_F(DBIteratorTest, DBIterator14) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, 1, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 4 /* sequence */, 1 /* max_sequential_skip_in_iterations */, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2493,9 +2578,10 @@ TEST_F(DBIteratorTest, DBIteratorTestDifferentialSnapshots) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - 
BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations, nullptr)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 13 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); // Expecting InternalKeys in [5,8] range with correct type int seqnums[4] = {5,8,11,13}; std::string user_keys[4] = {"1","2","3","4"}; @@ -2528,9 +2614,10 @@ TEST_F(DBIteratorTest, DBIteratorTestDifferentialSnapshots) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations, nullptr)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 13 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); // Expecting InternalKeys in [5,8] range with correct type int seqnums[4] = {5,8,11,13}; EntryType key_types[4] = {EntryType::kEntryDelete,EntryType::kEntryDelete, @@ -2578,10 +2665,10 @@ class DBIterWithMergeIterTest : public testing::Test { NewMergingIterator(&icomp_, &child_iters[0], 2u); db_iter_.reset(NewDBIterator( - env_, ro_, ImmutableCFOptions(options_), MutableCFOptions(options_), - BytewiseComparator(), merge_iter, + env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_), + BytewiseComparator(), merge_iter, nullptr /* version */, 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */, nullptr /*read_callback*/)); + 3 /* max iterators before reseek */, nullptr /* read_callback */)); } Env* env_; @@ -3018,9 +3105,10 @@ TEST_F(DBIteratorTest, SeekPrefixTombstones) { ro.prefix_same_as_start = true; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); int skipped_keys = 0; @@ -3054,15 +3142,16 @@ TEST_F(DBIteratorTest, SeekToFirstLowerBound) { ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); db_iter->SeekToFirst(); if (i == kNumKeys + 1) { // lower bound was beyond the last key ASSERT_FALSE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); } else { ASSERT_TRUE(db_iter->Valid()); int expected; @@ -3093,9 +3182,10 @@ TEST_F(DBIteratorTest, PrevLowerBound) { ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 
nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); for (int i = kNumKeys; i >= kLowerBound; --i) { @@ -3121,9 +3211,10 @@ TEST_F(DBIteratorTest, SeekLessLowerBound) { ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); auto before_lower_bound_str = std::to_string(kLowerBound - 1); Slice before_lower_bound(lower_bound_str); @@ -3146,9 +3237,10 @@ TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ReadOptions(), ImmutableCFOptions(options), - MutableCFOptions(options), BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekForPrev("a"); ASSERT_TRUE(db_iter->Valid()); diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 8d2b0a7c0f4..ae972ee967f 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -18,6 +18,7 @@ #include "rocksdb/perf_context.h" #include "table/block_based/flush_block_policy.h" #include "util/random.h" +#include "utilities/merge_operators/string_append/stringappend2.h" namespace ROCKSDB_NAMESPACE { @@ -67,8 +68,8 @@ TEST_P(DBIteratorTest, IteratorProperty) { // The test needs to be changed if kPersistedTier is supported in iterator. 
Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "1", "2"); - Delete(1, "2"); + ASSERT_OK(Put(1, "1", "2")); + ASSERT_OK(Delete(1, "2")); ReadOptions ropt; ropt.pin_data = false; { @@ -172,10 +173,10 @@ TEST_P(DBIteratorTest, NonBlockingIteration) { TEST_P(DBIteratorTest, IterSeekBeforePrev) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("0", "f")); ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("2", "j")); auto iter = NewIterator(ReadOptions()); iter->Seek(Slice("c")); @@ -199,7 +200,7 @@ TEST_P(DBIteratorTest, IterReseekNewUpperBound) { ASSERT_OK(Put("aabb", rnd.RandomString(400))); ASSERT_OK(Put("aaef", rnd.RandomString(400))); ASSERT_OK(Put("b", rnd.RandomString(400))); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ReadOptions opts; Slice ub = Slice("aa"); opts.iterate_upper_bound = &ub; @@ -215,10 +216,10 @@ TEST_P(DBIteratorTest, IterReseekNewUpperBound) { TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("0", "f")); ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("2", "j")); auto iter = NewIterator(ReadOptions()); iter->SeekForPrev(Slice("0")); @@ -238,7 +239,7 @@ TEST_P(DBIteratorTest, IterLongKeys) { ASSERT_OK(Put(MakeLongKey(20, 0), "0")); ASSERT_OK(Put(MakeLongKey(32, 2), "2")); ASSERT_OK(Put("a", "b")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put(MakeLongKey(50, 1), "1")); ASSERT_OK(Put(MakeLongKey(127, 3), "3")); ASSERT_OK(Put(MakeLongKey(64, 4), "4")); @@ -276,7 +277,7 @@ TEST_P(DBIteratorTest, IterLongKeys) { TEST_P(DBIteratorTest, IterNextWithNewerSeq) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); @@ -302,7 +303,7 @@ TEST_P(DBIteratorTest, IterNextWithNewerSeq) { TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); @@ -333,7 +334,7 @@ TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("e", "f")); @@ -377,6 +378,8 @@ TEST_P(DBIteratorTest, IterEmpty) { iter->SeekForPrev("foo"); ASSERT_EQ(IterStatus(iter), "(invalid)"); + ASSERT_OK(iter->status()); + delete iter; } while (ChangeCompactOptions()); } @@ -617,6 +620,40 @@ TEST_P(DBIteratorTest, IterReseek) { delete iter; } +TEST_F(DBIteratorTest, ReseekUponDirectionChange) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.merge_operator.reset( + new StringAppendTESTOperator(/*delim_char=*/' ')); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + { + std::unique_ptr 
it(db_->NewIterator(ReadOptions())); + it->SeekToLast(); + it->Prev(); + it->Next(); + } + ASSERT_EQ(1, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + const std::string merge_key("good"); + ASSERT_OK(Put(merge_key, "orig")); + ASSERT_OK(Merge(merge_key, "suffix")); + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->Seek(merge_key); + ASSERT_TRUE(it->Valid()); + const uint64_t prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Prev(); + ASSERT_EQ(prev_reseek_count + 1, options.statistics->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION)); + } +} + TEST_P(DBIteratorTest, IterSmallAndLargeMix) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -783,18 +820,18 @@ TEST_P(DBIteratorTest, IterWithSnapshot) { TEST_P(DBIteratorTest, IteratorPinsRef) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "hello"); + ASSERT_OK(Put(1, "foo", "hello")); // Get iterator that will yield the current contents of the DB. Iterator* iter = NewIterator(ReadOptions(), handles_[1]); // Write to force compactions - Put(1, "foo", "newvalue1"); + ASSERT_OK(Put(1, "foo", "newvalue1")); for (int i = 0; i < 100; i++) { // 100K values ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); } - Put(1, "foo", "newvalue2"); + ASSERT_OK(Put(1, "foo", "newvalue2")); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -809,8 +846,8 @@ TEST_P(DBIteratorTest, IteratorPinsRef) { TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "delete-cf-then-delete-iter"); - Put(1, "hello", "value2"); + ASSERT_OK(Put(1, "foo", "delete-cf-then-delete-iter")); + ASSERT_OK(Put(1, "hello", "value2")); ColumnFamilyHandle* cf = handles_[1]; ReadOptions ro; @@ -820,7 +857,7 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) { ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter"); // delete CF handle - db_->DestroyColumnFamilyHandle(cf); + EXPECT_OK(db_->DestroyColumnFamilyHandle(cf)); handles_.erase(std::begin(handles_) + 1); // delete Iterator after CF handle is deleted @@ -832,7 +869,7 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) { TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "drop-cf-then-delete-iter"); + ASSERT_OK(Put(1, "foo", "drop-cf-then-delete-iter")); ReadOptions ro; ColumnFamilyHandle* cf = handles_[1]; @@ -842,8 +879,8 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) { ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter"); // drop and delete CF - db_->DropColumnFamily(cf); - db_->DestroyColumnFamilyHandle(cf); + EXPECT_OK(db_->DropColumnFamily(cf)); + EXPECT_OK(db_->DestroyColumnFamilyHandle(cf)); handles_.erase(std::begin(handles_) + 1); // delete Iterator after CF handle is dropped @@ -1307,9 +1344,9 @@ TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) { // write three entries with different keys using Merge() WriteOptions wopts; - db_->Merge(wopts, "1", "data1"); - db_->Merge(wopts, "2", "data2"); - db_->Merge(wopts, "3", "data3"); + ASSERT_OK(db_->Merge(wopts, "1", "data1")); + ASSERT_OK(db_->Merge(wopts, "2", "data2")); + ASSERT_OK(db_->Merge(wopts, "3", "data3")); std::unique_ptr it(NewIterator(ReadOptions())); @@ -1393,7 +1430,7 @@ class DBIteratorTestForPinnedData : public DBIteratorTest { if (run_config == TestConfig::FLUSH_EVERY_1000) { if (i && i % 1000 == 0) { - Flush(); + ASSERT_OK(Flush()); } } } @@ -1402,7 +1439,7 
@@ class DBIteratorTestForPinnedData : public DBIteratorTest { Close(); Reopen(options); } else if (run_config == TestConfig::COMPACT_BEFORE_READ) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } ReadOptions ro; @@ -1516,6 +1553,10 @@ TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) { PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000); } +INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance, + DBIteratorTestForPinnedData, + testing::Values(true, false)); + #ifndef ROCKSDB_LITE TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) { Options options = CurrentOptions(); @@ -1768,6 +1809,7 @@ TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) { Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1823,6 +1865,7 @@ TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) { Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -2151,19 +2194,19 @@ TEST_P(DBIteratorTest, ReadAhead) { std::string value(1024, 'a'); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); MoveFilesToLevel(2); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); MoveFilesToLevel(1); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); #ifndef ROCKSDB_LITE @@ -2270,6 +2313,7 @@ TEST_P(DBIteratorTest, Refresh) { ASSERT_OK(Put("x", "y")); std::unique_ptr iter(NewIterator(ReadOptions())); + ASSERT_OK(iter->status()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->key().compare(Slice("x")), 0); @@ -2284,7 +2328,8 @@ TEST_P(DBIteratorTest, Refresh) { iter->Next(); ASSERT_FALSE(iter->Valid()); - iter->Refresh(); + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2295,7 +2340,7 @@ TEST_P(DBIteratorTest, Refresh) { iter->Next(); ASSERT_FALSE(iter->Valid()); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("m", "n")); @@ -2308,7 +2353,8 @@ TEST_P(DBIteratorTest, Refresh) { iter->Next(); ASSERT_FALSE(iter->Valid()); - iter->Refresh(); + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2331,6 +2377,7 @@ TEST_P(DBIteratorTest, RefreshWithSnapshot) { ReadOptions options; options.snapshot = snapshot; Iterator* iter = NewIterator(options); + ASSERT_OK(iter->status()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2346,8 +2393,8 @@ TEST_P(DBIteratorTest, RefreshWithSnapshot) { iter->Next(); ASSERT_FALSE(iter->Valid()); - Status s; - s = iter->Refresh(); + ASSERT_OK(iter->status()); + Status s = iter->Refresh(); ASSERT_TRUE(s.IsNotSupported()); db_->ReleaseSnapshot(snapshot); delete iter; @@ -2405,14 +2452,14 @@ TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) { TEST_P(DBIteratorTest, TableFilter) { ASSERT_OK(Put("a", "1")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("b", "2")); ASSERT_OK(Put("c", "3")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("d", "4")); ASSERT_OK(Put("e", "5")); ASSERT_OK(Put("f", "6")); - dbfull()->Flush(FlushOptions()); + 
EXPECT_OK(dbfull()->Flush(FlushOptions())); // Ensure the table_filter callback is called once for each table. { @@ -2597,13 +2644,13 @@ TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) { ReadOptions ropts; ropts.max_skippable_internal_keys = 2; - Put("1", "val_1"); + ASSERT_OK(Put("1", "val_1")); // Add more tombstones than max_skippable_internal_keys so that Next() fails. - Delete("2"); - Delete("3"); - Delete("4"); - Delete("5"); - Put("6", "val_6"); + ASSERT_OK(Delete("2")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Delete("4")); + ASSERT_OK(Delete("5")); + ASSERT_OK(Put("6", "val_6")); std::unique_ptr iter(NewIterator(ropts)); iter->SeekToFirst(); @@ -2645,9 +2692,9 @@ TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { DestroyAndReopen(options); // Two records in sst file, each in its own block. - Put("b", ""); - Put("d", ""); - Flush(); + ASSERT_OK(Put("b", "")); + ASSERT_OK(Put("d", "")); + ASSERT_OK(Flush()); // Create a nonblocking iterator before writing to memtable. ReadOptions ropt; @@ -2657,7 +2704,7 @@ TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { // Overwrite a key in memtable many times to hit // max_sequential_skip_in_iterations (which is 8 by default). for (int i = 0; i < 20; ++i) { - Put("c", ""); + ASSERT_OK(Put("c", "")); } // Load the second block in sst file into the block cache. @@ -2674,9 +2721,9 @@ TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { } TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { - Put("a", ""); - Put("b", ""); - Flush(); + ASSERT_OK(Put("a", "")); + ASSERT_OK(Put("b", "")); + ASSERT_OK(Flush()); ReadOptions ropt; Slice ub = "b"; @@ -2883,6 +2930,127 @@ TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) { ASSERT_OK(iter->status()); } +TEST_P(DBIteratorTest, Blob) { + Options options = CurrentOptions(); + options.enable_blob_files = true; + options.max_sequential_skip_in_iterations = 2; + options.statistics = CreateDBStatistics(); + + Reopen(options); + + // Note: we have 4 KVs (3 of which are hidden) for key "b" and + // max_sequential_skip_in_iterations is set to 2. Thus, we need to do a reseek + // anytime we move from "b" to "c" or vice versa. 
+ ASSERT_OK(Put("a", "va")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb3")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Flush()); + + std::unique_ptr iter_guard(NewIterator(ReadOptions())); + Iterator* const iter = iter_guard.get(); + + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->SeekForPrev("d"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("c"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("bx"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->Seek("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Seek("z"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->SeekForPrev(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + 
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + // Switch from forward to reverse + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 7); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 8); + ASSERT_EQ(IterStatus(iter), "b->vb3"); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); @@ -3021,6 +3189,44 @@ TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) { delete iter; } +TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = false; + options.env = env_; + DestroyAndReopen(options); + constexpr int kNumKeys = 10; + + // Write kNumKeys to WAL. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ReadOptions read_opts; + read_opts.total_order_seek = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + int count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ++count; + } + ASSERT_EQ(kNumKeys, count); + } + + // Reopen and rebuild the memtable from WAL. + options.create_if_missing = false; + options.avoid_flush_during_recovery = true; + options.inplace_update_support = true; + options.allow_concurrent_memtable_write = false; + Reopen(options); + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + // Backward iteration not supported due to inplace_update_support = true. + ASSERT_TRUE(iter->status().IsNotSupported()); + ASSERT_FALSE(iter->Valid()); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_kv_checksum_test.cc b/db/db_kv_checksum_test.cc new file mode 100644 index 00000000000..24411811928 --- /dev/null +++ b/db/db_kv_checksum_test.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +enum class WriteBatchOpType { + kPut = 0, + kDelete, + kSingleDelete, + kDeleteRange, + kMerge, + kBlobIndex, + kNum, +}; + +// Integer addition is needed for `::testing::Range()` to take the enum type. 
+WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) { + using T = std::underlying_type::type; + return static_cast(static_cast(lhs) + rhs); +} + +class DbKvChecksumTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DbKvChecksumTest() + : DBTestBase("/db_kv_checksum_test", /*env_do_fsync=*/false) { + op_type_ = std::get<0>(GetParam()); + corrupt_byte_addend_ = std::get<1>(GetParam()); + } + + std::pair GetWriteBatch(size_t ts_sz, + ColumnFamilyHandle* cf_handle) { + Status s; + WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */, ts_sz, + 8 /* protection_bytes_per_entry */); + switch (op_type_) { + case WriteBatchOpType::kPut: + s = wb.Put(cf_handle, "key", "val"); + break; + case WriteBatchOpType::kDelete: + s = wb.Delete(cf_handle, "key"); + break; + case WriteBatchOpType::kSingleDelete: + s = wb.SingleDelete(cf_handle, "key"); + break; + case WriteBatchOpType::kDeleteRange: + s = wb.DeleteRange(cf_handle, "begin", "end"); + break; + case WriteBatchOpType::kMerge: + s = wb.Merge(cf_handle, "key", "val"); + break; + case WriteBatchOpType::kBlobIndex: + // TODO(ajkr): use public API once available. + uint32_t cf_id; + if (cf_handle == nullptr) { + cf_id = 0; + } else { + cf_id = cf_handle->GetID(); + } + s = WriteBatchInternal::PutBlobIndex(&wb, cf_id, "key", "val"); + break; + case WriteBatchOpType::kNum: + assert(false); + } + return {std::move(wb), std::move(s)}; + } + + void CorruptNextByteCallBack(void* arg) { + Slice encoded = *static_cast(arg); + if (entry_len_ == port::kMaxSizet) { + // We learn the entry size on the first attempt + entry_len_ = encoded.size(); + } + // All entries should be the same size + assert(entry_len_ == encoded.size()); + char* buf = const_cast(encoded.data()); + buf[corrupt_byte_offset_] += corrupt_byte_addend_; + ++corrupt_byte_offset_; + } + + bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; } + + protected: + WriteBatchOpType op_type_; + char corrupt_byte_addend_; + size_t corrupt_byte_offset_ = 0; + size_t entry_len_ = port::kMaxSizet; +}; + +std::string GetTestNameSuffix( + ::testing::TestParamInfo> info) { + std::ostringstream oss; + switch (std::get<0>(info.param)) { + case WriteBatchOpType::kPut: + oss << "Put"; + break; + case WriteBatchOpType::kDelete: + oss << "Delete"; + break; + case WriteBatchOpType::kSingleDelete: + oss << "SingleDelete"; + break; + case WriteBatchOpType::kDeleteRange: + oss << "DeleteRange"; + break; + case WriteBatchOpType::kMerge: + oss << "Merge"; + break; + case WriteBatchOpType::kBlobIndex: + oss << "BlobIndex"; + break; + case WriteBatchOpType::kNum: + assert(false); + } + oss << "Add" + << static_cast(static_cast(std::get<1>(info.param))); + return oss.str(); +} + +INSTANTIATE_TEST_CASE_P( + DbKvChecksumTest, DbKvChecksumTest, + ::testing::Combine(::testing::Range(static_cast(0), + WriteBatchOpType::kNum), + ::testing::Values(2, 103, 251)), + GetTestNameSuffix); + +TEST_P(DbKvChecksumTest, MemTableAddCorrupted) { + // This test repeatedly attempts to write `WriteBatch`es containing a single + // entry of type `op_type_`. Each attempt has one byte corrupted in its + // memtable entry by adding `corrupt_byte_addend_` to its original value. The + // test repeats until an attempt has been made on each byte in the encoded + // memtable entry. 
All attempts are expected to fail with `Status::Corruption` + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:Encoded", + std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this, + std::placeholders::_1)); + + while (MoreBytesToCorrupt()) { + // Failed memtable insert always leads to read-only mode, so we have to + // reopen for every attempt. + Options options = CurrentOptions(); + if (op_type_ == WriteBatchOpType::kMerge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + } + Reopen(options); + + SyncPoint::GetInstance()->EnableProcessing(); + auto batch_and_status = + GetWriteBatch(0 /* ts_sz */, nullptr /* cf_handle */); + ASSERT_OK(batch_and_status.second); + ASSERT_TRUE( + db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) { + // This test repeatedly attempts to write `WriteBatch`es containing a single + // entry of type `op_type_` to a non-default column family. Each attempt has + // one byte corrupted in its memtable entry by adding `corrupt_byte_addend_` + // to its original value. The test repeats until an attempt has been made on + // each byte in the encoded memtable entry. All attempts are expected to fail + // with `Status::Corruption`. + Options options = CurrentOptions(); + if (op_type_ == WriteBatchOpType::kMerge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + } + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:Encoded", + std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this, + std::placeholders::_1)); + + while (MoreBytesToCorrupt()) { + // Failed memtable insert always leads to read-only mode, so we have to + // reopen for every attempt. 
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + + SyncPoint::GetInstance()->EnableProcessing(); + auto batch_and_status = GetWriteBatch(0 /* ts_sz */, handles_[1]); + ASSERT_OK(batch_and_status.second); + ASSERT_TRUE( + db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc index 51d232a6a4f..9baf720375c 100644 --- a/db/db_log_iter_test.cc +++ b/db/db_log_iter_test.cc @@ -13,6 +13,7 @@ #if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/stack_trace.h" namespace ROCKSDB_NAMESPACE { @@ -33,9 +34,8 @@ class DBTestXactLogIterator : public DBTestBase { }; namespace { -SequenceNumber ReadRecords( - std::unique_ptr& iter, - int& count) { +SequenceNumber ReadRecords(std::unique_ptr& iter, + int& count, bool expect_ok = true) { count = 0; SequenceNumber lastSequence = 0; BatchResult res; @@ -47,6 +47,11 @@ SequenceNumber ReadRecords( EXPECT_OK(iter->status()); iter->Next(); } + if (expect_ok) { + EXPECT_OK(iter->status()); + } else { + EXPECT_NOK(iter->status()); + } return res.sequence; } @@ -64,9 +69,9 @@ TEST_F(DBTestXactLogIterator, TransactionLogIterator) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Put(0, "key1", DummyString(1024)); - Put(1, "key2", DummyString(1024)); - Put(1, "key2", DummyString(1024)); + ASSERT_OK(Put(0, "key1", DummyString(1024))); + ASSERT_OK(Put(1, "key2", DummyString(1024))); + ASSERT_OK(Put(1, "key2", DummyString(1024))); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U); { auto iter = OpenTransactionLogIter(0); @@ -75,9 +80,9 @@ TEST_F(DBTestXactLogIterator, TransactionLogIterator) { ReopenWithColumnFamilies({"default", "pikachu"}, options); env_->SleepForMicroseconds(2 * 1000 * 1000); { - Put(0, "key4", DummyString(1024)); - Put(1, "key5", DummyString(1024)); - Put(0, "key6", DummyString(1024)); + ASSERT_OK(Put(0, "key4", DummyString(1024))); + ASSERT_OK(Put(1, "key5", DummyString(1024))); + ASSERT_OK(Put(0, "key6", DummyString(1024))); } { auto iter = OpenTransactionLogIter(0); @@ -109,15 +114,15 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key2", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key3", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key4", DummyString(1024)); + ASSERT_OK(Put("key1", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key2", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key3", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key4", DummyString(1024))); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); { auto iter = OpenTransactionLogIter(0); @@ -130,11 +135,11 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) { // condition FlushOptions flush_options; flush_options.wait = false; - dbfull()->Flush(flush_options); + ASSERT_OK(dbfull()->Flush(flush_options)); // "key5" would be 
written in a new memtable and log - Put("key5", DummyString(1024)); - dbfull()->FlushWAL(false); + ASSERT_OK(Put("key5", DummyString(1024))); + ASSERT_OK(dbfull()->FlushWAL(false)); { // this iter would miss "key4" if not fixed auto iter = OpenTransactionLogIter(0); @@ -149,14 +154,14 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); + ASSERT_OK(Put("key1", DummyString(1024))); auto iter = OpenTransactionLogIter(0); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); iter->Next(); ASSERT_TRUE(!iter->Valid()); ASSERT_OK(iter->status()); - Put("key2", DummyString(1024)); + ASSERT_OK(Put("key2", DummyString(1024))); iter->Next(); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); @@ -167,9 +172,9 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); - Put("key2", DummyString(1023)); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("key1", DummyString(1024))); + ASSERT_OK(Put("key2", DummyString(1023))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); Reopen(options); auto iter = OpenTransactionLogIter(0); ExpectRecords(2, iter); @@ -181,10 +186,10 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); for (int i = 0; i < 1024; i++) { - Put("key"+ToString(i), DummyString(10)); + ASSERT_OK(Put("key" + ToString(i), DummyString(10))); } - dbfull()->Flush(FlushOptions()); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(dbfull()->FlushWAL(false)); // Corrupt this log to create a gap ROCKSDB_NAMESPACE::VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); @@ -197,13 +202,13 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) { } // Insert a new entry to a new log file - Put("key1025", DummyString(10)); - dbfull()->FlushWAL(false); + ASSERT_OK(Put("key1025", DummyString(10))); + ASSERT_OK(dbfull()->FlushWAL(false)); // Try to read from the beginning. 
Should stop before the gap and read less // than 1025 entries auto iter = OpenTransactionLogIter(0); int count; - SequenceNumber last_sequence_read = ReadRecords(iter, count); + SequenceNumber last_sequence_read = ReadRecords(iter, count, false); ASSERT_LT(last_sequence_read, 1025U); // Try to read past the gap, should be able to seek to key1025 auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); @@ -217,15 +222,15 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); WriteBatch batch; - batch.Put(handles_[1], "key1", DummyString(1024)); - batch.Put(handles_[0], "key2", DummyString(1024)); - batch.Put(handles_[1], "key3", DummyString(1024)); - batch.Delete(handles_[0], "key2"); - dbfull()->Write(WriteOptions(), &batch); - Flush(1); - Flush(0); + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); ReopenWithColumnFamilies({"default", "pikachu"}, options); - Put(1, "key4", DummyString(1024)); + ASSERT_OK(Put(1, "key4", DummyString(1024))); auto iter = OpenTransactionLogIter(3); ExpectRecords(2, iter); } while (ChangeCompactOptions()); @@ -237,13 +242,13 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { CreateAndReopenWithCF({"pikachu"}, options); { WriteBatch batch; - batch.Put(handles_[1], "key1", DummyString(1024)); - batch.Put(handles_[0], "key2", DummyString(1024)); - batch.PutLogData(Slice("blob1")); - batch.Put(handles_[1], "key3", DummyString(1024)); - batch.PutLogData(Slice("blob2")); - batch.Delete(handles_[0], "key2"); - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); ReopenWithColumnFamilies({"default", "pikachu"}, options); } @@ -268,7 +273,7 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { return Status::OK(); } } handler; - res.writeBatchPtr->Iterate(&handler); + ASSERT_OK(res.writeBatchPtr->Iterate(&handler)); ASSERT_EQ( "Put(1, key1, 1024)" "Put(0, key2, 1024)" diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc index 20f6abadca1..1057871c9f3 100644 --- a/db/db_logical_block_size_cache_test.cc +++ b/db/db_logical_block_size_cache_test.cc @@ -401,7 +401,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ColumnFamilyOptions cf_options0; cf_options0.cf_paths = {{cf_path_0_, 1024}}; ColumnFamilyHandle* cf0; - db0->CreateColumnFamily(cf_options0, "cf", &cf0); + ASSERT_OK(db0->CreateColumnFamily(cf_options0, "cf", &cf0)); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); @@ -421,7 +421,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ColumnFamilyOptions cf_options1; cf_options1.cf_paths = {{cf_path_1_, 1024}}; ColumnFamilyHandle* cf1; - db1->CreateColumnFamily(cf_options1, "cf", &cf1); + ASSERT_OK(db1->CreateColumnFamily(cf_options1, "cf", &cf1)); 
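Each database and column family in this test gets its own directory, which is what drives the per-path entries in the logical block size cache. A minimal sketch of giving a column family its own path via ColumnFamilyOptions::cf_paths (path and target size are illustrative; db is assumed to be an open rocksdb::DB*):

#include <rocksdb/db.h>

rocksdb::ColumnFamilyOptions cf_options;
cf_options.cf_paths = {{"/data/cf_dir", 1024 /* target_size in bytes */}};

rocksdb::ColumnFamilyHandle* cf = nullptr;
rocksdb::Status s = db->CreateColumnFamily(cf_options, "cf", &cf);
if (s.ok()) {
  // ... use the column family; its SST files land under /data/cf_dir ...
  s = db->DestroyColumnFamilyHandle(cf);
}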
ASSERT_EQ(4, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); @@ -432,7 +432,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ASSERT_TRUE(cache_->Contains(cf_path_1_)); ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); - db0->DestroyColumnFamilyHandle(cf0); + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); delete db0; ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_1_)); @@ -441,7 +441,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}})); - db1->DestroyColumnFamilyHandle(cf1); + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); delete db1; ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}})); @@ -466,7 +466,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); ColumnFamilyHandle* cf0; - db0->CreateColumnFamily(cf_options, "cf", &cf0); + ASSERT_OK(db0->CreateColumnFamily(cf_options, "cf", &cf0)); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); @@ -482,14 +482,14 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); ColumnFamilyHandle* cf1; - db1->CreateColumnFamily(cf_options, "cf", &cf1); + ASSERT_OK(db1->CreateColumnFamily(cf_options, "cf", &cf1)); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(2, cache_->GetRefCount(data_path_0_)); ASSERT_TRUE(cache_->Contains(cf_path_0_)); ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); - db0->DestroyColumnFamilyHandle(cf0); + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); delete db0; ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); @@ -498,7 +498,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}})); - db1->DestroyColumnFamilyHandle(cf1); + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); delete db1; ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}})); diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 99763e3516e..cc4aaeb81b6 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -129,7 +129,6 @@ class TestPrefixExtractor : public SliceTransform { TEST_F(DBMemTableTest, DuplicateSeq) { SequenceNumber seq = 123; std::string value; - Status s; MergeContext merge_context; Options options; InternalKeyComparator ikey_cmp(options.comparator); @@ -140,28 +139,31 @@ TEST_F(DBMemTableTest, DuplicateSeq) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Write some keys and make sure it returns false on duplicates - bool res; - res = mem->Add(seq, kTypeValue, "key", "value2"); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key", "value2"); - ASSERT_FALSE(res); + ASSERT_OK( + mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)); + ASSERT_TRUE( + mem->Add(seq, kTypeValue, "key", "value2", 
nullptr /* kv_prot_info */) + .IsTryAgain()); // Changing the type should still cause the duplicatae key - res = mem->Add(seq, kTypeMerge, "key", "value2"); - ASSERT_FALSE(res); + ASSERT_TRUE( + mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */) + .IsTryAgain()); // Changing the seq number will make the key fresh - res = mem->Add(seq + 1, kTypeMerge, "key", "value2"); - ASSERT_TRUE(res); + ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2", + nullptr /* kv_prot_info */)); // Test with different types for duplicate keys - res = mem->Add(seq, kTypeDeletion, "key", ""); - ASSERT_FALSE(res); - res = mem->Add(seq, kTypeSingleDeletion, "key", ""); - ASSERT_FALSE(res); + ASSERT_TRUE( + mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */) + .IsTryAgain()); + ASSERT_TRUE( + mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */) + .IsTryAgain()); // Test the duplicate keys under stress for (int i = 0; i < 10000; i++) { @@ -169,11 +171,12 @@ TEST_F(DBMemTableTest, DuplicateSeq) { if (!insert_dup) { seq++; } - res = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq)); + Status s = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq), + nullptr /* kv_prot_info */); if (insert_dup) { - ASSERT_FALSE(res); + ASSERT_TRUE(s.IsTryAgain()); } else { - ASSERT_TRUE(res); + ASSERT_OK(s); } } delete mem; @@ -181,26 +184,28 @@ TEST_F(DBMemTableTest, DuplicateSeq) { // Test with InsertWithHint options.memtable_insert_with_hint_prefix_extractor.reset( new TestPrefixExtractor()); // which uses _ to extract the prefix - ioptions = ImmutableCFOptions(options); + ioptions = ImmutableOptions(options); mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Insert a duplicate key with _ in it - res = mem->Add(seq, kTypeValue, "key_1", "value"); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key_1", "value"); - ASSERT_FALSE(res); + ASSERT_OK( + mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)); + ASSERT_TRUE( + mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */) + .IsTryAgain()); delete mem; // Test when InsertConcurrently will be invoked options.allow_concurrent_memtable_write = true; - ioptions = ImmutableCFOptions(options); + ioptions = ImmutableOptions(options); mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); MemTablePostProcessInfo post_process_info; - res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); - ASSERT_FALSE(res); + ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value", + nullptr /* kv_prot_info */, true, &post_process_info)); + ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value", + nullptr /* kv_prot_info */, true, &post_process_info) + .IsTryAgain()); delete mem; } @@ -208,7 +213,6 @@ TEST_F(DBMemTableTest, DuplicateSeq) { TEST_F(DBMemTableTest, ConcurrentMergeWrite) { int num_ops = 1000; std::string value; - Status s; MergeContext merge_context; Options options; // A merge operator that is not sensitive to concurrent writes since in this @@ -220,15 +224,14 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { auto factory = std::make_shared(); options.memtable_factory = factory; options.allow_concurrent_memtable_write = true; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager 
wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Put 0 as the base PutFixed64(&value, static_cast(0)); - bool res = mem->Add(0, kTypeValue, "key", value); - ASSERT_TRUE(res); + ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */)); value.clear(); // Write Merge concurrently @@ -237,9 +240,8 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { std::string v1; for (int seq = 1; seq < num_ops / 2; seq++) { PutFixed64(&v1, seq); - bool res1 = - mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1); - ASSERT_TRUE(res1); + ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1, nullptr /* kv_prot_info */, + true, &post_process_info1)); v1.clear(); } }); @@ -248,9 +250,8 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { std::string v2; for (int seq = num_ops / 2; seq < num_ops; seq++) { PutFixed64(&v2, seq); - bool res2 = - mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2); - ASSERT_TRUE(res2); + ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2, nullptr /* kv_prot_info */, + true, &post_process_info2)); v2.clear(); } }); @@ -261,8 +262,9 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { ReadOptions roptions; SequenceNumber max_covering_tombstone_seq = 0; LookupKey lkey("key", kMaxSequenceNumber); - res = mem->Get(lkey, &value, /*timestamp=*/nullptr, &status, &merge_context, - &max_covering_tombstone_seq, roptions); + bool res = mem->Get(lkey, &value, /*timestamp=*/nullptr, &status, + &merge_context, &max_covering_tombstone_seq, roptions); + ASSERT_OK(status); ASSERT_TRUE(res); uint64_t ivalue = DecodeFixed64(Slice(value).data()); uint64_t sum = 0; @@ -316,6 +318,7 @@ TEST_F(DBMemTableTest, InsertWithHint) { TEST_F(DBMemTableTest, ColumnFamilyId) { // Verifies MemTableRepFactory is told the right column family id. 
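The hunks above reflect MemTable::Add switching from a bool return to a Status: a duplicate of an already-inserted (key, sequence number) pair now surfaces as Status::TryAgain instead of false. A sketch of the calling convention these tests rely on (internal API; mem, seq, and kTypeValue as defined in the surrounding test):

Status s = mem->Add(seq, kTypeValue, "key", "value",
                    nullptr /* kv_prot_info */);
if (s.ok()) {
  // first insertion of this (key, seq) pair succeeded
} else if (s.IsTryAgain()) {
  // duplicate (key, seq); the caller is expected to bump the sequence number
}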
Options options; + options.env = CurrentOptions().env; options.allow_concurrent_memtable_write = false; options.create_if_missing = true; options.memtable_factory.reset(new MockMemTableRepFactory()); diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc index 7ec256213a4..45bafb44c75 100644 --- a/db/db_merge_operand_test.cc +++ b/db/db_merge_operand_test.cc @@ -19,33 +19,35 @@ namespace ROCKSDB_NAMESPACE { -class DBMergeOperandTest : public DBTestBase { +namespace { +class LimitedStringAppendMergeOp : public StringAppendTESTOperator { public: - DBMergeOperandTest() - : DBTestBase("/db_merge_operand_test", /*env_do_fsync=*/true) {} -}; + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} -TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { - class LimitedStringAppendMergeOp : public StringAppendTESTOperator { - public: - LimitedStringAppendMergeOp(int limit, char delim) - : StringAppendTESTOperator(delim), limit_(limit) {} + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } - const char* Name() const override { - return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; } + return false; + } - bool ShouldMerge(const std::vector& operands) const override { - if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { - return true; - } - return false; - } + private: + size_t limit_ = 0; +}; +} // namespace - private: - size_t limit_ = 0; - }; +class DBMergeOperandTest : public DBTestBase { + public: + DBMergeOperandTest() + : DBTestBase("/db_merge_operand_test", /*env_do_fsync=*/true) {} +}; +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { Options options; options.create_if_missing = true; // Use only the latest two merge operands. @@ -59,29 +61,29 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { merge_operands_info.expected_max_number_of_operands = num_records; // k0 value in memtable - Put("k0", "PutARock"); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(Put("k0", "PutARock")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "PutARock"); // k0.1 value in SST - Put("k0.1", "RockInSST"); + ASSERT_OK(Put("k0.1", "RockInSST")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "RockInSST"); // All k1 values are in memtable. 
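GetMergeOperands is the API exercised throughout this test: it returns the raw operands for a key without applying the merge operator. A minimal sketch of a call like the ones above (key name and operand count are illustrative; db is assumed to be an open rocksdb::DB*):

#include <rocksdb/db.h>
#include <vector>

constexpr int kMaxOperands = 4;
std::vector<rocksdb::PinnableSlice> operands(kMaxOperands);
rocksdb::GetMergeOperandsOptions info;
info.expected_max_number_of_operands = kMaxOperands;
int num_operands = 0;
rocksdb::Status s = db->GetMergeOperands(
    rocksdb::ReadOptions(), db->DefaultColumnFamily(), "k1", operands.data(),
    &info, &num_operands);
// On success, operands[0..num_operands) holds the base value (if any) followed
// by the merge operands. If more operands exist than
// expected_max_number_of_operands allows, the call returns Status::Incomplete,
// as the BlobDB variant of this test checks explicitly.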
ASSERT_OK(Merge("k1", "a")); - Put("k1", "x"); + ASSERT_OK(Put("k1", "x")); ASSERT_OK(Merge("k1", "b")); ASSERT_OK(Merge("k1", "c")); ASSERT_OK(Merge("k1", "d")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "x"); ASSERT_EQ(values[1], "b"); ASSERT_EQ(values[2], "c"); @@ -98,13 +100,13 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { // All k1.1 values are in memtable. ASSERT_OK(Merge("k1.1", "r")); - Delete("k1.1"); + ASSERT_OK(Delete("k1.1")); ASSERT_OK(Merge("k1.1", "c")); ASSERT_OK(Merge("k1.1", "k")); ASSERT_OK(Merge("k1.1", "s")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "c"); ASSERT_EQ(values[1], "k"); ASSERT_EQ(values[2], "s"); @@ -115,9 +117,9 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { ASSERT_OK(Merge("k2", "e")); ASSERT_OK(Merge("k2", "r")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "q"); ASSERT_EQ(values[1], "w"); ASSERT_EQ(values[2], "e"); @@ -125,30 +127,30 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { // All k2.1 values are flushed to L0 into a single file. ASSERT_OK(Merge("k2.1", "m")); - Put("k2.1", "l"); + ASSERT_OK(Put("k2.1", "l")); ASSERT_OK(Merge("k2.1", "n")); ASSERT_OK(Merge("k2.1", "o")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "l,n,o"); // All k2.2 values are flushed to L0 into a single file. ASSERT_OK(Merge("k2.2", "g")); - Delete("k2.2"); + ASSERT_OK(Delete("k2.2")); ASSERT_OK(Merge("k2.2", "o")); ASSERT_OK(Merge("k2.2", "t")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "o,t"); // Do some compaction that will make the following tests more predictable // Slice start("PutARock"); // Slice end("t"); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // All k3 values are flushed and are in different files. 
ASSERT_OK(Merge("k3", "ab")); @@ -158,9 +160,9 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { ASSERT_OK(Merge("k3", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "ab"); ASSERT_EQ(values[1], "bc"); ASSERT_EQ(values[2], "cd"); @@ -169,14 +171,14 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { // All k3.1 values are flushed and are in different files. ASSERT_OK(Merge("k3.1", "ab")); ASSERT_OK(Flush()); - Put("k3.1", "bc"); + ASSERT_OK(Put("k3.1", "bc")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.1", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.1", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "bc"); ASSERT_EQ(values[1], "cd"); ASSERT_EQ(values[2], "de"); @@ -184,14 +186,14 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { // All k3.2 values are flushed and are in different files. ASSERT_OK(Merge("k3.2", "ab")); ASSERT_OK(Flush()); - Delete("k3.2"); + ASSERT_OK(Delete("k3.2")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.2", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.2", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "cd"); ASSERT_EQ(values[1], "de"); @@ -206,32 +208,120 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_OK(Merge("k4", "ed")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "ba"); ASSERT_EQ(values[1], "cb"); ASSERT_EQ(values[2], "dc"); ASSERT_EQ(values[3], "ed"); - // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + // First 3 k5 values are in SST and next 4 k5 values are in Immutable + // Memtable ASSERT_OK(Merge("k5", "who")); ASSERT_OK(Merge("k5", "am")); ASSERT_OK(Merge("k5", "i")); ASSERT_OK(Flush()); - Put("k5", "remember"); + ASSERT_OK(Put("k5", "remember")); ASSERT_OK(Merge("k5", "i")); ASSERT_OK(Merge("k5", "am")); ASSERT_OK(Merge("k5", "rocks")); - dbfull()->TEST_SwitchMemtable(); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k5", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "remember"); ASSERT_EQ(values[1], "i"); ASSERT_EQ(values[2], "am"); } +TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) { + Options options; + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + // Use 
only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1 values are in memtable. + ASSERT_OK(Put("k1", "x")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. + merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Put("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "q,w,e,r"); + + // Do some compaction that will make the following tests more predictable + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // All k3 values are flushed and are in different files. 
+ ASSERT_OK(Put("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All K4 values are in different levels + ASSERT_OK(Put("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 3db68685185..4b819bd624a 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -37,7 +37,7 @@ class TestReadCallback : public ReadCallback { class DBMergeOperatorTest : public DBTestBase { public: DBMergeOperatorTest() - : DBTestBase("/db_merge_operator_test", /*env_do_fsync=*/true) {} + : DBTestBase("/db_merge_operator_test", /*env_do_fsync=*/false) {} std::string GetWithReadCallback(SnapshotChecker* snapshot_checker, const Slice& key, @@ -94,7 +94,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { ASSERT_OK(Merge("k1", "c")); ASSERT_OK(Merge("k1", "d")); std::string value; - ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k1", &value)); // Make sure that only the latest two merge operands are used. If this was // not the case the value would be "a,b,c,d". ASSERT_EQ(value, "c,d"); @@ -105,7 +105,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { ASSERT_OK(Merge("k2", "c")); ASSERT_OK(Merge("k2", "d")); ASSERT_OK(Flush()); - ASSERT_TRUE(db_->Get(ReadOptions(), "k2", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k2", &value)); ASSERT_EQ(value, "c,d"); // All K3 values are flushed and are in different files. 
@@ -116,7 +116,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { ASSERT_OK(Merge("k3", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3", "de")); - ASSERT_TRUE(db_->Get(ReadOptions(), "k3", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k3", &value)); ASSERT_EQ(value, "cd,de"); // All K4 values are in different levels @@ -130,7 +130,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_OK(Merge("k4", "de")); - ASSERT_TRUE(db_->Get(ReadOptions(), "k4", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k4", &value)); ASSERT_EQ(value, "cd,de"); } @@ -344,8 +344,9 @@ TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) { // Code executed before merge operation merge_hook->before_merge_ = [&]() { // Evict all tables from cache before every merge operation + auto* table_cache = dbfull()->TEST_table_cache(); for (uint64_t num : file_numbers) { - TableCache::Evict(dbfull()->TEST_table_cache(), num); + TableCache::Evict(table_cache, num); } // Decrease cache capacity to force all unrefed blocks to be evicted if (bbto.block_cache) { @@ -366,7 +367,7 @@ TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) { VerifyDBFromMap(true_data, &total_reads); ASSERT_EQ(merge_cnt, total_reads); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDBFromMap(true_data, &total_reads); } @@ -385,7 +386,7 @@ TEST_P(MergeOperatorPinningTest, TailingIterator) { std::function writer_func = [&]() { int k = 0; for (int i = 0; i < kNumWrites; i++) { - db_->Merge(WriteOptions(), Key(k), Key(k)); + ASSERT_OK(db_->Merge(WriteOptions(), Key(k), Key(k))); if (i && i % kNumOperands == 0) { k++; @@ -403,7 +404,7 @@ TEST_P(MergeOperatorPinningTest, TailingIterator) { ReadOptions ro; ro.tailing = true; Iterator* iter = db_->NewIterator(ro); - + ASSERT_OK(iter->status()); iter->SeekToFirst(); for (int i = 0; i < (kNumWrites / kNumOperands); i++) { while (!iter->Valid()) { @@ -416,6 +417,7 @@ TEST_P(MergeOperatorPinningTest, TailingIterator) { iter->Next(); } + ASSERT_OK(iter->status()); delete iter; }; @@ -449,12 +451,13 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { // ForwardIterator to not pin it in some circumstances. This test // reproduces it. 
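Both tests in this region drive tailing iterators. A minimal sketch of creating one (db is assumed to be an open rocksdb::DB*; on non-lite builds the tailing flag is what selects the ForwardIterator path exercised here):

#include <rocksdb/db.h>
#include <memory>

rocksdb::ReadOptions ro;
ro.tailing = true;  // the iterator keeps seeing data written after its creation
std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(ro));
iter->Seek("key");
if (iter->Valid()) {
  // value() reflects the newest visible merge result for "key"
}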
- db_->Merge(WriteOptions(), "key", "sst"); - db_->Flush(FlushOptions()); // Switch to SuperVersion A - db_->Merge(WriteOptions(), "key", "memtable"); + ASSERT_OK(db_->Merge(WriteOptions(), "key", "sst")); + ASSERT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion A + ASSERT_OK(db_->Merge(WriteOptions(), "key", "memtable")); // Pin SuperVersion A std::unique_ptr someone_else(db_->NewIterator(ReadOptions())); + ASSERT_OK(someone_else->status()); bool pushed_first_operand = false; bool stepped_to_next_operand = false; @@ -462,7 +465,7 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) { EXPECT_FALSE(pushed_first_operand); pushed_first_operand = true; - db_->Flush(FlushOptions()); // Switch to SuperVersion B + EXPECT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion B }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) { @@ -477,7 +480,7 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { std::unique_ptr iter(db_->NewIterator(ro)); iter->Seek("key"); - ASSERT_TRUE(iter->status().ok()); + ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString()); EXPECT_TRUE(pushed_first_operand); diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 0ab06489c8f..96fd37357dd 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -34,7 +34,7 @@ class DBOptionsTest : public DBTestBase { const DBOptions& options) { std::string options_str; std::unordered_map mutable_map; - ConfigOptions config_options; + ConfigOptions config_options(options); config_options.delimiter = "; "; EXPECT_OK(GetStringFromMutableDBOptions( @@ -79,6 +79,22 @@ class DBOptionsTest : public DBTestBase { #endif // ROCKSDB_LITE }; +TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) { + Options options; + options.env = env_; + options.track_and_verify_wals_in_manifest = true; + + ImmutableDBOptions db_options(options); + ASSERT_TRUE(db_options.track_and_verify_wals_in_manifest); + + Reopen(options); + ASSERT_TRUE(dbfull()->GetDBOptions().track_and_verify_wals_in_manifest); + + Status s = + dbfull()->SetDBOptions({{"track_and_verify_wals_in_manifest", "false"}}); + ASSERT_FALSE(s.ok()); +} + // RocksDB lite don't support dynamic options. 
#ifndef ROCKSDB_LITE @@ -113,6 +129,83 @@ TEST_F(DBOptionsTest, GetLatestCFOptions) { GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1]))); } +TEST_F(DBOptionsTest, SetMutableTableOptions) { + Options options; + options.create_if_missing = true; + options.env = env_; + options.blob_file_size = 16384; + BlockBasedTableOptions bbto; + bbto.no_block_cache = true; + bbto.block_size = 8192; + bbto.block_restart_interval = 7; + + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + Options c_opts = dbfull()->GetOptions(cfh); + const auto* c_bbto = + c_opts.table_factory->GetOptions(); + ASSERT_NE(c_bbto, nullptr); + ASSERT_EQ(c_opts.blob_file_size, 16384); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 8192); + ASSERT_EQ(c_bbto->block_restart_interval, 7); + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"table_factory.block_restart_interval", "11"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Now set an option that is not mutable - options should not change + ASSERT_NOK( + dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that are not - options should not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.no_block_cache", "false"}, + {"table_factory.block_size", "8192"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that do not exist - options should not + // change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "8192"}, + {"table_factory.does_not_exist", "true"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Trying to change the table factory fails + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory", TableFactory::kPlainTableName()}})); + + // Set some on the table and some on the Column Family + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"blob_file_size", "32768"}, + {"table_factory.block_restart_interval", "13"}})); + c_opts = dbfull()->GetOptions(cfh); + ASSERT_EQ(c_opts.blob_file_size, 32768); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); + // Set some on the table and a bad one on the ColumnFamily - options should + // not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "1024"}, + {"no_such_option", "32768"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); +} + TEST_F(DBOptionsTest, SetBytesPerSync) { const size_t kValueSize = 1024 * 1024; // 1MB Options options; @@ -174,10 +267,11 @@ TEST_F(DBOptionsTest, SetWalBytesPerSync) { options.env = env_; Reopen(options); ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync); - int counter = 0; + std::atomic_int counter{0}; int low_bytes_per_sync = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { 
counter++; }); + "WritableFileWriter::RangeSync:0", + [&](void* /*arg*/) { counter.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); const std::string kValue(kValueSize, 'v'); int i = 0; @@ -606,6 +700,7 @@ TEST_F(DBOptionsTest, MaxOpenFilesChange) { TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { Options options; + options.env = CurrentOptions().env; options.delayed_write_rate = 0; Reopen(options); ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); @@ -617,6 +712,7 @@ TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { Options options; + options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleUniversal; options.ttl = 0; @@ -646,6 +742,7 @@ TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { TEST_F(DBOptionsTest, SanitizeTtlDefault) { Options options; + options.env = CurrentOptions().env; Reopen(options); ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); @@ -662,6 +759,7 @@ TEST_F(DBOptionsTest, SanitizeTtlDefault) { TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { Options options; options.compaction_style = kCompactionStyleFIFO; + options.env = CurrentOptions().env; options.ttl = 0; Reopen(options); ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); @@ -687,6 +785,7 @@ TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { Options options; + options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.arena_block_size = 4096; @@ -708,7 +807,7 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { for (int j = 0; j < 10; j++) { ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -826,6 +925,7 @@ TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.create_if_missing = true; + options.env = CurrentOptions().env; ASSERT_OK(TryReopen(options)); @@ -879,6 +979,7 @@ TEST_F(DBOptionsTest, ChangeCompression) { options.bottommost_compression = CompressionType::kNoCompression; options.bottommost_compression_opts.level = 2; options.bottommost_compression_opts.parallel_threads = 1; + options.env = CurrentOptions().env; ASSERT_OK(TryReopen(options)); @@ -929,6 +1030,66 @@ TEST_F(DBOptionsTest, ChangeCompression) { #endif // ROCKSDB_LITE +TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) { + // Verify the bottommost compression options still take effect even when the + // bottommost compression type is left at its default value. Verify for both + // automatic and manual compaction. 
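A sketch of the option combination this test covers: the bottommost CompressionOptions take effect purely through their enabled flag, while bottommost_compression itself stays at its default of kDisableCompressionOption (values mirror the test):

#include <rocksdb/options.h>

rocksdb::Options options;
options.compression = rocksdb::kLZ4Compression;
options.compression_opts.level = 1;             // applies to upper levels
options.bottommost_compression_opts.level = 2;  // applies to the last level
options.bottommost_compression_opts.enabled = true;
// Without `enabled`, the bottommost CompressionOptions are ignored unless
// bottommost_compression is also set to a concrete compression type.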
+ if (!Snappy_Supported() || !LZ4_Supported()) { + return; + } + + constexpr int kUpperCompressionLevel = 1; + constexpr int kBottommostCompressionLevel = 2; + constexpr int kNumL0Files = 2; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.compression = CompressionType::kLZ4Compression; + options.compression_opts.level = kUpperCompressionLevel; + options.bottommost_compression_opts.level = kBottommostCompressionLevel; + options.bottommost_compression_opts.enabled = true; + Reopen(options); + + CompressionType compression_used = CompressionType::kDisableCompressionOption; + CompressionOptions compression_opt_used; + bool compacted = false; + SyncPoint::GetInstance()->SetCallBack( + "CompactionPicker::RegisterCompaction:Registered", [&](void* arg) { + Compaction* c = static_cast(arg); + compression_used = c->output_compression(); + compression_opt_used = c->output_compression_opts(); + compacted = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // First, verify for automatic compaction. + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kLZ4Compression, compression_used); + ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); + + // Second, verify for manual compaction. + compacted = false; + compression_used = CompressionType::kDisableCompressionOption; + compression_opt_used = CompressionOptions(); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kLZ4Compression, compression_used); + ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index ff6ded37fa7..8945ee29148 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -336,7 +336,7 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { table_options.filter_policy.reset( NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -536,7 +536,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { table_options.filter_policy.reset( NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -1175,6 +1175,61 @@ class CountingDeleteTabPropCollectorFactory } }; +class BlockCountingTablePropertiesCollector : public TablePropertiesCollector { + public: + static const std::string kNumSampledBlocksPropertyName; + + const char* Name() const override { + return "BlockCountingTablePropertiesCollector"; + } + + Status Finish(UserCollectedProperties* properties) override { + (*properties)[kNumSampledBlocksPropertyName] = + ToString(num_sampled_blocks_); + return Status::OK(); + } + + Status 
AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override { + if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) { + num_sampled_blocks_++; + } + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{ + {kNumSampledBlocksPropertyName, ToString(num_sampled_blocks_)}, + }; + } + + private: + uint32_t num_sampled_blocks_ = 0; +}; + +const std::string + BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName = + "NumSampledBlocks"; + +class BlockCountingTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + const char* Name() const override { + return "BlockCountingTablePropertiesCollectorFactory"; + } + + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /* context */) override { + return new BlockCountingTablePropertiesCollector(); + } +}; + #ifndef ROCKSDB_LITE TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) { Options options = CurrentOptions(); @@ -1413,8 +1468,134 @@ TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) { } } +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) { + // Sampled compression requires at least one of the following four types. + if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() && + !ZSTD_Supported()) { + return; + } + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.table_properties_collector_factories.emplace_back( + std::make_shared()); + + for (bool sample_for_compression : {false, true}) { + // For simplicity/determinism, sample 100% when enabled, or 0% when disabled + options.sample_for_compression = sample_for_compression ? 1 : 0; + + DestroyAndReopen(options); + + // Setup the following LSM: + // + // L0_0 ["a", "b"] + // L1_0 ["a", "b"] + // + // L0_0 was created by flush. L1_0 was created by compaction. Each file + // contains one data block. + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + if (i == 1) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + + // A `BlockAdd()` should have been seen for files generated by flush or + // compaction when `sample_for_compression` is enabled. + TablePropertiesCollection file_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); + ASSERT_EQ(2, file_to_props.size()); + for (const auto& file_and_props : file_to_props) { + auto& user_props = file_and_props.second->user_collected_properties; + ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector:: + kNumSampledBlocksPropertyName) != + user_props.end()); + ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector:: + kNumSampledBlocksPropertyName), + ToString(sample_for_compression ? 
1 : 0)); + } + } +} + +class CompressionSamplingDBPropertiesTest + : public DBPropertiesTest, + public ::testing::WithParamInterface { + public: + CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {} + + protected: + const bool fast_; +}; + +INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest, + CompressionSamplingDBPropertiesTest, ::testing::Bool()); + +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_P(CompressionSamplingDBPropertiesTest, + EstimateDataSizeWithCompressionSampling) { + Options options = CurrentOptions(); + if (fast_) { + // One of the following light compression libraries must be present. + if (LZ4_Supported()) { + options.compression = kLZ4Compression; + } else if (Snappy_Supported()) { + options.compression = kSnappyCompression; + } else { + return; + } + } else { + // One of the following heavy compression libraries must be present. + if (ZSTD_Supported()) { + options.compression = kZSTD; + } else if (Zlib_Supported()) { + options.compression = kZlibCompression; + } else { + return; + } + } + options.disable_auto_compactions = true; + // For simplicity/determinism, sample 100%. + options.sample_for_compression = 1; + Reopen(options); + + // Setup the following LSM: + // + // L0_0 ["a", "b"] + // L1_0 ["a", "b"] + // + // L0_0 was created by flush. L1_0 was created by compaction. Each file + // contains one data block. The value consists of compressible data so the + // data block should be stored compressed. + std::string val(1024, 'a'); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("a", val)); + ASSERT_OK(Put("b", val)); + ASSERT_OK(Flush()); + if (i == 1) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + + TablePropertiesCollection file_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); + ASSERT_EQ(2, file_to_props.size()); + for (const auto& file_and_props : file_to_props) { + ASSERT_GT(file_and_props.second->data_size, 0); + if (fast_) { + ASSERT_EQ(file_and_props.second->data_size, + file_and_props.second->fast_compression_estimated_data_size); + } else { + ASSERT_EQ(file_and_props.second->data_size, + file_and_props.second->slow_compression_estimated_data_size); + } + } +} + TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) { - Options options; + Options options = CurrentOptions(); Reopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Delete("foo")); @@ -1445,6 +1626,7 @@ TEST_F(DBPropertiesTest, EstimateOldestKeyTime) { options.compaction_style = kCompactionStyleFIFO; options.ttl = 300; + options.max_open_files = -1; options.compaction_options_fifo.allow_compaction = false; DestroyAndReopen(options); @@ -1524,6 +1706,7 @@ TEST_F(DBPropertiesTest, SstFilesSize) { std::shared_ptr listener = std::make_shared(); Options options; + options.env = CurrentOptions().env; options.disable_auto_compactions = true; options.listeners.push_back(listener); Reopen(options); @@ -1608,6 +1791,8 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { Options options; uint64_t value; + options.env = CurrentOptions().env; + // Block cache properties are not available for tables other than // block-based table. 
options.table_factory.reset(NewPlainTableFactory()); diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 706f95e49d3..10720e758dd 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { class DBRangeDelTest : public DBTestBase { public: - DBRangeDelTest() : DBTestBase("/db_range_del_test", /*env_do_fsync=*/true) {} + DBRangeDelTest() : DBTestBase("/db_range_del_test", /*env_do_fsync=*/false) {} std::string GetNumericStr(int key) { uint64_t uint64_key = static_cast(key); @@ -56,7 +56,7 @@ TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) { } TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) { - db_->Put(WriteOptions(), "b", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b", "val")); ASSERT_TRUE( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a") .IsInvalidArgument()); @@ -73,6 +73,15 @@ TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) { } while (ChangeOptions(kRangeDelSkipConfigs)); } +TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) { + Options opts = CurrentOptions(); + opts.compression_opts.max_dict_bytes = 16384; + Reopen(opts); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr2")); + ASSERT_OK(db_->Flush(FlushOptions())); +} + TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) { do { Options opts = CurrentOptions(); @@ -82,13 +91,14 @@ TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) { // snapshot protects range tombstone from dropping due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"); - db_->Flush(FlushOptions()); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(1, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); @@ -118,7 +128,8 @@ TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) { // snapshot protects range tombstone from dropping due to becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(1))); Random rnd(301); for (int i = 0; i < kNumFiles; ++i) { @@ -128,18 +139,18 @@ TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) { values.push_back(rnd.RandomString(3 << 10)); ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); if (j == 0 && i > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } } // put extra key to trigger final flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(2, NumTableFilesAtLevel(1)); db_->ReleaseSnapshot(snapshot); @@ -159,7 +170,7 @@ TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) { // Want max_compaction_bytes to trigger the end of compaction output file, not // target_file_size_base, so make the latter much bigger opts.target_file_size_base = 100 * opts.max_compaction_bytes; - Reopen(opts); + DestroyAndReopen(opts); // snapshot protects range tombstone from dropping due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); @@ -178,12 +189,12 @@ TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) { } // extra entry to trigger SpecialSkipListFactory's flush ASSERT_OK(Put(GetNumericStr(kNumPerFile), "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GE(NumTableFilesAtLevel(1), 2); @@ -221,10 +232,10 @@ TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) { } TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) { - db_->Put(WriteOptions(), "b1", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b1", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c")); - db_->Put(WriteOptions(), "b2", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b2", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b")); // first iteration verifies query correctness in memtable, second verifies @@ -241,8 +252,9 @@ TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) { } TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) { - db_->Put(WriteOptions(), "unused", "val"); // prevents empty after compaction - db_->Put(WriteOptions(), "b1", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "unused", + "val")); // prevents empty after compaction + ASSERT_OK(db_->Put(WriteOptions(), "b1", "val")); ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c")); @@ -254,8 +266,8 @@ TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) { for (int i = 0; i < 2; ++i) { if (i > 0) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, 
nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); } @@ -269,7 +281,7 @@ TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) { const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - Reopen(opts); + DestroyAndReopen(opts); // Write a third before snapshot, a third between snapshot and tombstone, and // a third after the tombstone. Keys older than snapshot or newer than the @@ -279,12 +291,13 @@ TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) { if (i == kNum / 3) { snapshot = db_->GetSnapshot(); } else if (i == 2 * kNum / 3) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); for (int i = 0; i < kNum; ++i) { ReadOptions read_opts; @@ -309,29 +322,32 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) { opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); opts.num_levels = 2; opts.statistics = CreateDBStatistics(); - Reopen(opts); + DestroyAndReopen(opts); for (int i = 0; i < kNumFiles; ++i) { if (i > 0) { // range tombstone covers first half of the previous file - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr((i - 1) * kNumPerFile), - GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)); + ASSERT_OK(db_->DeleteRange( + WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr((i - 1) * kNumPerFile), + GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2))); } // Make sure a given key appears in each file so compaction won't be able to // use trivial move, which would happen if the ranges were non-overlapping. // Also, we need an extra element since flush is only triggered when the // number of keys is one greater than SpecialSkipListFactory's limit. // We choose a key outside the key-range used by the test to avoid conflict. 
- db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), + "val")); for (int j = 0; j < kNumPerFile; ++j) { - db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"); + ASSERT_OK( + db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val")); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2, @@ -373,8 +389,8 @@ TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) { if (i > 0) { // delete [95,105) in two files, [295,305) in next two int mid = (j + (1 - j % 2)) * kNumPerFile; - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(mid - 5), Key(mid + 5)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(mid - 5), Key(mid + 5))); } std::vector values; // Write 100KB (100 values, each 1K) @@ -384,7 +400,7 @@ TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) { } // put extra key to trigger flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (j < kNumFiles - 1) { // background compaction may happen early for kNumFiles'th file ASSERT_EQ(NumTableFilesAtLevel(0), j + 1); @@ -400,7 +416,7 @@ TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) { // oversized L0 (relative to base_level) causes the compaction to run // earlier. ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"disable_auto_compactions", "true"}})); ASSERT_EQ(NumTableFilesAtLevel(0), 0); @@ -433,8 +449,8 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) { // insert range deletions [95,105) in two files, [295,305) in next two // to prepare L1 for later manual compaction. 
int mid = (j + (1 - j % 2)) * kNumPerFile; - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(mid - 5), Key(mid + 5)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(mid - 5), Key(mid + 5))); } std::vector values; // Write 100KB (100 values, each 1K) @@ -444,13 +460,13 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) { } // put extra key to trigger flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (j < kFilesPerLevel - 1) { // background compaction may happen early for kFilesPerLevel'th file ASSERT_EQ(NumTableFilesAtLevel(0), j + 1); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1); } @@ -483,17 +499,17 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) { if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) { // Delete merge operands from all but the last file - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", - "key_"); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); } std::string val; PutFixed64(&val, i); - db_->Merge(WriteOptions(), "key", val); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); // we need to prevent trivial move using Puts so compaction will actually // process the merge operands. - db_->Put(WriteOptions(), "prevent_trivial_move", ""); + ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", "")); if (i > 0 && i % kNumPerFile == 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } @@ -504,7 +520,7 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { PutFixed64(&expected, 45); // 1+2+...+9 ASSERT_EQ(expected, actual); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); expected.clear(); ASSERT_OK(db_->Get(read_opts, "key", &actual)); @@ -550,19 +566,19 @@ TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { opts.statistics = CreateDBStatistics(); Reopen(opts); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", - "dr10"); // obsolete after compaction - db_->Put(WriteOptions(), "key", "val"); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr10")); // obsolete after compaction + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", - "dr20"); // protected by snapshot - db_->Put(WriteOptions(), "key", "val"); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", + "dr20")); // protected by snapshot + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(2, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); @@ -603,28 +619,30 @@ TEST_F(DBRangeDelTest, 
TableEvictedDuringScan) { bbto.cache_index_and_filter_blocks = true; bbto.block_cache = NewLRUCache(8 << 20); opts.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(opts); + DestroyAndReopen(opts); // Hold a snapshot so range deletions can't become obsolete during compaction // to bottommost level (i.e., L1). const Snapshot* snapshot = db_->GetSnapshot(); for (int i = 0; i < kNum; ++i) { - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); if (i > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } } // Must be > 1 so the first L1 file can be closed before scan finishes - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(NumTableFilesAtLevel(1), 1); std::vector file_numbers = ListTableFiles(env_, dbname_); ReadOptions read_opts; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = kRangeEnd; iter->SeekToFirst(); for (auto file_number : file_numbers) { @@ -647,7 +665,7 @@ TEST_F(DBRangeDelTest, TableEvictedDuringScan) { TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) { do { DestroyAndReopen(CurrentOptions()); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); @@ -669,10 +687,10 @@ TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) { opts.memtable_factory.reset(new SpecialSkipListFactory(1)); DestroyAndReopen(opts); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "blah", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "blah", "val")); ReadOptions read_opts; std::string value; @@ -683,7 +701,7 @@ TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) { TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) { do { DestroyAndReopen(CurrentOptions()); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( @@ -706,11 +724,11 @@ TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) { for (int i = 0; i < kNumMergeOps; ++i) { std::string val; PutFixed64(&val, i); - db_->Merge(WriteOptions(), "key", val); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); if (i == kNumMergeOps / 2) { // deletes [0, 5] - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", - "key_"); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); } } @@ -734,16 +752,16 @@ TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) { opts.memtable_factory.reset(new SpecialSkipListFactory(1)); Reopen(opts); - db_->Put(WriteOptions(), "sst_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ASSERT_OK(db_->Flush(FlushOptions())); - 
db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); @@ -761,20 +779,22 @@ TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) { Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + DestroyAndReopen(opts); // Write half of the keys before the tombstone and half after the tombstone. // Only covered keys (i.e., within the range and older than the tombstone) // should be deleted. for (int i = 0; i < kNum; ++i) { if (i == kNum / 2) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } ReadOptions read_opts; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -794,7 +814,7 @@ TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) { Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + DestroyAndReopen(opts); const Snapshot* snapshot = nullptr; // Put a snapshot before the range tombstone, verify an iterator using that @@ -802,14 +822,16 @@ TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) { for (int i = 0; i < kNum; ++i) { if (i == kNum / 2) { snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } ReadOptions read_opts; read_opts.snapshot = snapshot; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -828,22 +850,23 @@ TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) { opts.memtable_factory.reset(new SpecialSkipListFactory(1)); Reopen(opts); - db_->Put(WriteOptions(), "sst_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ASSERT_OK(db_->Flush(FlushOptions())); - db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ReadOptions read_opts; read_opts.ignore_range_deletions = true; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int i = 0; std::string expected[] = {"imm_key", 
"mem_key", "sst_key"}; for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) { @@ -857,7 +880,7 @@ TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) { #ifndef ROCKSDB_UBSAN_RUN TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( @@ -873,6 +896,7 @@ TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { iter->SeekToFirst(); } ASSERT_TRUE(iter->status().IsNotSupported()); + delete iter; if (i == 0) { ASSERT_OK(db_->Flush(FlushOptions())); @@ -882,7 +906,6 @@ TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { } db_->ReleaseSnapshot(snapshot); } - #endif // !ROCKSDB_UBSAN_RUN TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) { @@ -926,8 +949,8 @@ TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) { ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); - db_->EnableAutoCompaction({db_->DefaultColumnFamily()}); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->ReleaseSnapshot(snapshot); } @@ -949,7 +972,7 @@ TEST_F(DBRangeDelTest, MemtableBloomFilter) { for (int i = 0; i < kNumKeys; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(kNumKeys))); for (int i = 0; i < kNumKeys; ++i) { @@ -987,8 +1010,8 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { // snapshot protects range tombstone from dropping due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), - Key(2 * kNumFilesPerLevel)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(2 * kNumFilesPerLevel))); Random rnd(301); std::string value = rnd.RandomString(kValueBytes); @@ -997,14 +1020,14 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { ASSERT_OK(Put(Key(j), value)); ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); if (j > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(j, NumTableFilesAtLevel(0)); } } // put extra key to trigger final flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1)); @@ -1022,7 +1045,7 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { } else if (i == 2) { ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"max_bytes_for_level_base", "10000"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(1)); } ASSERT_GT(NumTableFilesAtLevel(2), 0); @@ -1056,8 +1079,8 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { // A snapshot protects the range tombstone from dropping due to // becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(0), Key(2 * kNumFilesPerLevel)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(2 * kNumFilesPerLevel))); // Create 2 additional sstables in L0. Note that the first sstable // contains the range tombstone. @@ -1096,7 +1119,7 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { ASSERT_EQ(value, Get(Key(2))); auto begin_str = Key(3); const ROCKSDB_NAMESPACE::Slice begin = begin_str; - dbfull()->TEST_CompactRange(1, &begin, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(2, NumTableFilesAtLevel(2)); ASSERT_EQ(value, Get(Key(2))); @@ -1115,7 +1138,7 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { // [key000002#6,1, key000004#72057594037927935,15] auto begin_str = Key(0); const ROCKSDB_NAMESPACE::Slice begin = begin_str; - dbfull()->TEST_CompactRange(1, &begin, &begin); + ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); ASSERT_EQ(3, NumTableFilesAtLevel(2)); } @@ -1216,9 +1239,9 @@ TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) { std::string value; ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); - dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // Now we have multiple files at L1 all containing a single user key, thus // guaranteeing overlap in the file endpoints. @@ -1229,9 +1252,9 @@ TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) { // Compact and verify again. It's worthwhile because now the files have // tighter endpoints, so we can verify that doesn't mess anything up. 
- dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 1 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_GT(NumTableFilesAtLevel(2), 1); ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); @@ -1307,6 +1330,7 @@ TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) { auto get_key_count = [this]() -> int { auto* iter = db_->NewIterator(ReadOptions()); + assert(iter->status().ok()); iter->SeekToFirst(); int keys_found = 0; for (; iter->Valid(); iter->Next()) { @@ -1409,6 +1433,7 @@ TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) { ASSERT_GT(NumTableFilesAtLevel(1), 1); auto* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); iter->SeekToLast(); int keys_found = 0; for (; iter->Valid(); iter->Prev()) { @@ -1435,11 +1460,12 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) { ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(10))); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); ReadOptions read_opts; read_opts.snapshot = snapshot; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -1482,6 +1508,7 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) { ReadOptions read_opts; read_opts.snapshot = snapshot.get(); std::unique_ptr iter(db_->NewIterator(read_opts)); + ASSERT_OK(iter->status()); TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator"); @@ -1519,7 +1546,7 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { std::string value = rnd.RandomString(kValueBytes); ASSERT_OK(Put(key, value)); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); MoveFilesToLevel(2); } ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -1538,7 +1565,7 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { // TODO(ajkr): remove this `Put` after file cutting accounts for range // tombstones (#3977). ASSERT_OK(Put("c" + Key(1), "value")); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone // and the range tombstone is only placed in the second SST. @@ -1546,9 +1573,9 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { Slice begin_key(begin_key_storage); std::string end_key_storage("d"); Slice end_key(end_key_storage); - dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */, - &end_key /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, &begin_key /* begin */, &end_key /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_EQ(2, NumTableFilesAtLevel(1)); std::vector all_metadata; @@ -1613,15 +1640,15 @@ TEST_F(DBRangeDelTest, OverlappedTombstones) { ASSERT_EQ(1, NumTableFilesAtLevel(0)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); // The tombstone range is not broken up into multiple SSTs which may incur a // large compaction with L2. 
ASSERT_EQ(1, NumTableFilesAtLevel(1)); std::vector> files; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(1, NumTableFilesAtLevel(2)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); } @@ -1654,14 +1681,15 @@ TEST_F(DBRangeDelTest, OverlappedKeys) { // The key range is broken up into three SSTs to avoid a future big compaction // with the grandparent - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(3, NumTableFilesAtLevel(1)); - std::vector> files; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); - ASSERT_EQ(1, NumTableFilesAtLevel(2)); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + ASSERT_EQ( + 3, NumTableFilesAtLevel( + 2)); // L1->L2 compaction size is limited to max_compaction_bytes ASSERT_EQ(0, NumTableFilesAtLevel(1)); } diff --git a/db/db_impl/db_secondary_test.cc b/db/db_secondary_test.cc similarity index 79% rename from db/db_impl/db_secondary_test.cc rename to db/db_secondary_test.cc index 23dc63aca09..13ec1eec0e0 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -104,7 +104,7 @@ void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } @@ -147,6 +147,213 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) { ASSERT_EQ(2, count); } +TEST_F(DBSecondaryTest, SimpleInternalCompaction) { + Options options; + options.env = env_; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + Close(); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, + &result)); + + ASSERT_EQ(result.output_files.size(), 1); + InternalKey smallest, largest; + smallest.DecodeFrom(result.output_files[0].smallest_internal_key); + largest.DecodeFrom(result.output_files[0].largest_internal_key); + ASSERT_EQ(smallest.user_key().ToString(), "bar"); + ASSERT_EQ(largest.user_key().ToString(), "foo"); + ASSERT_EQ(result.output_level, 1); + ASSERT_EQ(result.output_path, this->secondary_path_); + ASSERT_EQ(result.num_output_records, 2); + ASSERT_GT(result.bytes_written, 0); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + const int kRangeL2 = 10; + const int kRangeL1 = 30; + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i * kRangeL2), "value" + ToString(i))); + ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 
1), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(Key(i * kRangeL1), "value" + ToString(i))); + ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put(Key(i * 30), "value" + ToString(i))); + ASSERT_OK(Put(Key(i * 30 + 50), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + // pick 2 files on level 0 for compaction, which has 3 overlap files on L1 + CompactionServiceInput input1; + input1.input_files.push_back(meta.levels[0].files[2].name); + input1.input_files.push_back(meta.levels[0].files[3].name); + input1.input_files.push_back(meta.levels[1].files[0].name); + input1.input_files.push_back(meta.levels[1].files[1].name); + input1.input_files.push_back(meta.levels[1].files[2].name); + + input1.output_level = 1; + + options.max_open_files = -1; + Close(); + + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + CompactionServiceResult result; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input1, + &result)); + ASSERT_OK(result.status); + + // pick 2 files on level 1 for compaction, which has 6 overlap files on L2 + CompactionServiceInput input2; + input2.input_files.push_back(meta.levels[1].files[1].name); + input2.input_files.push_back(meta.levels[1].files[2].name); + for (int i = 3; i < 9; i++) { + input2.input_files.push_back(meta.levels[2].files[i].name); + } + + input2.output_level = 2; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2, + &result)); + ASSERT_OK(result.status); + + CloseSecondary(); + + // delete all l2 files, without update manifest + for (auto& file : meta.levels[2].files) { + ASSERT_OK(env_->DeleteFile(dbname_ + file.name)); + } + OpenSecondary(options); + cfh = db_secondary_->DefaultColumnFamily(); + Status s = db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2, + &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); + + // TODO: L0 -> L1 compaction should success, currently version is not built + // if files is missing. 
+ // ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, + // input1, &result)); +} + +TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + + // trigger compaction to delete the files for secondary instance compaction + ASSERT_OK(Put("foo", "foo_value" + std::to_string(3))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(3))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Close(); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + Status s = + db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + + Close(); + + ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0])); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + Status s = + db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); + + input.input_files.erase(input.input_files.begin()); + + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, + &result)); + ASSERT_OK(result.status); +} + TEST_F(DBSecondaryTest, OpenAsSecondary) { Options options; options.env = env_; @@ -459,20 +666,6 @@ TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { } TEST_F(DBSecondaryTest, MissingTableFile) { - int table_files_not_exist = 0; - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", - [&](void* arg) { - Status s = *reinterpret_cast(arg); - if (s.IsPathNotFound()) { - ++table_files_not_exist; - } else if (!s.ok()) { - assert(false); // Should not reach here - } - }); - SyncPoint::GetInstance()->EnableProcessing(); Options options; options.env = env_; options.level0_file_num_compaction_trigger = 4; @@ -499,7 +692,6 @@ TEST_F(DBSecondaryTest, MissingTableFile) { ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); 
ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); ASSERT_EQ("foo_value" + std::to_string(options.level0_file_num_compaction_trigger - 1), @@ -615,10 +807,7 @@ TEST_F(DBSecondaryTest, SwitchManifest) { range_scan_db(); } -// Here, "Snapshot" refers to the version edits written by -// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after -// switching from the old one. -TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { +TEST_F(DBSecondaryTest, SwitchManifestTwice) { Options options; options.env = env_; options.disable_auto_compactions = true; @@ -640,10 +829,15 @@ TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { Reopen(options); ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + Reopen(options); + ASSERT_OK(Put("0", "value1")); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value1", value); } -TEST_F(DBSecondaryTest, SwitchWAL) { +TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) { const int kNumKeysPerMemtable = 1; Options options; options.env = env_; @@ -692,7 +886,7 @@ TEST_F(DBSecondaryTest, SwitchWAL) { } } -TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { +TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) { const int kNumKeysPerMemtable = 1; SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->LoadDependency( @@ -748,10 +942,12 @@ TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { } }; for (int k = 0; k != 8; ++k) { - ASSERT_OK( - Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); - ASSERT_OK( - Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(0 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + ASSERT_OK(Put(1 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + } TEST_SYNC_POINT( "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); @@ -883,6 +1079,7 @@ TEST_F(DBSecondaryTest, StartFromInconsistent) { }); SyncPoint::GetInstance()->EnableProcessing(); Options options1; + options1.env = env_; Status s = TryOpenSecondary(options1); ASSERT_TRUE(s.IsCorruption()); } @@ -894,6 +1091,7 @@ TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) { ASSERT_OK(Flush()); Options options1; + options1.env = env_; OpenSecondary(options1); { diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 2287c2425ae..640b4f5a2cd 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -98,7 +98,7 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) { for (int i = 0; i < 10; ++i) { GenerateNewFile(&rnd, &key_id, false); } - Flush(); + ASSERT_OK(Flush()); Close(); int const num_files = GetSstFileCount(dbname_); ASSERT_GT(num_files, 0); @@ -141,6 +141,7 @@ TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) { // Just open the DB with the option set to true and check that we don't crash. 
Options options; + options.env = env_; options.skip_checking_sst_file_sizes_on_db_open = true; Reopen(options); @@ -169,7 +170,7 @@ TEST_F(DBSSTTest, DontDeleteMovedFile) { ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); // If the moved file is actually deleted (the move-safeguard in @@ -217,7 +218,7 @@ TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) { ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); test::SleepingBackgroundTask blocking_thread; @@ -263,9 +264,9 @@ TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) { // finish the flush! blocking_thread.WakeUp(); blocking_thread.WaitUntilDone(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // File just flushed is too big for L0 and L1 so gets moved to L2. - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0)); metadata.clear(); @@ -301,17 +302,17 @@ TEST_F(DBSSTTest, DBWithSstFileManager) { for (int i = 0; i < 25; i++) { GenerateNewRandomFile(&rnd); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify that we are tracking all sst files in dbname_ std::unordered_map<std::string, uint64_t> files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); } ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); std::unordered_map<std::string, uint64_t> files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); // Verify that we are tracking all sst files in dbname_ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); // Verify the total files size @@ -345,6 +346,262 @@ TEST_F(DBSSTTest, DBWithSstFileManager) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { + std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast<const std::string*>(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast<const std::string*>(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast<const std::string*>(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options
options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + DestroyAndReopen(options); + Random rnd(301); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("Key_" + std::to_string(i), "Value_" + std::to_string(i))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + std::vector blob_files = GetBlobFileNumbers(); + ASSERT_EQ(files_added, blob_files.size()); + // No blob file is obsoleted. + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + // No files were moved. + ASSERT_EQ(files_moved, 0); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + + // Verify that we are tracking all sst and blob files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + Close(); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened. + Close(); + + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Destroy DB and it will remove all the blob files from sst file manager and + // blob files deletion will go through ScheduleFileDeletion. 
+ ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_EQ(files_deleted, blob_files.size()); + ASSERT_EQ(files_scheduled_to_delete, blob_files.size()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + Random rnd(301); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + constexpr char fifth_key[] = "fifth_key"; + constexpr char fifth_value[] = "fifth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Put(fifth_key, fifth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 5); + ASSERT_EQ(files_added, 5); + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + ASSERT_EQ(files_moved, 0); + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + size_t expected_number_of_files = original_blob_files.size(); + // Note: turning off enable_blob_files before the compaction results 
in + // garbage collected values getting inlined. + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + expected_number_of_files -= cutoff_index; + files_added = 0; + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + ASSERT_EQ(Get(fifth_key), fifth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + // No new file is added. + ASSERT_EQ(files_added, 0); + ASSERT_EQ(files_deleted, cutoff_index); + ASSERT_EQ(files_scheduled_to_delete, cutoff_index); + ASSERT_EQ(files_moved, 0); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + sfm->WaitForEmptyTrash(); + ASSERT_EQ(files_deleted, 5); + ASSERT_EQ(files_scheduled_to_delete, 5); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + TEST_F(DBSSTTest, RateLimitedDelete) { Destroy(last_options_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ @@ -373,6 +630,14 @@ TEST_F(DBSSTTest, RateLimitedDelete) { *abs_time_us = Env::Default()->NowMicros(); }); + // Disable PeriodicWorkScheduler as it also has TimedWait, which could update + // the simulated sleep time + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicWorkScheduler:DisableScheduler", [&](void* arg) { + bool* disable_scheduler = static_cast(arg); + *disable_scheduler = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); @@ -392,7 +657,7 @@ TEST_F(DBSSTTest, RateLimitedDelete) { WriteOptions wo; wo.disableWAL = true; - ASSERT_OK(TryReopen(options)); + Reopen(options); // Create 4 files in L0 for (char v = 'a'; v <= 'd'; v++) { ASSERT_OK(Put("Key2", DummyString(1024, v), wo)); @@ -488,10 +753,11 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { } class DBWALTestWithParam - : public DBSSTTest, + : public DBTestBase, public testing::WithParamInterface> { public: - DBWALTestWithParam() { + explicit DBWALTestWithParam() + : DBTestBase("/db_wal_test_with_params", /*env_do_fsync=*/true) { wal_dir_ = std::get<0>(GetParam()); wal_dir_same_as_dbname_ = std::get<1>(GetParam()); } @@ -519,7 +785,7 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { bool fake_log_delete; }; - std::unique_ptr env(new MyEnv(Env::Default())); + std::unique_ptr env(new MyEnv(env_)); Destroy(last_options_); env->set_fake_log_delete(true); @@ -539,10 +805,15 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { auto sfm = static_cast(options.sst_file_manager.get()); sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); - ASSERT_OK(TryReopen(options)); + Reopen(options); // Create 4 files in L0 for (char v = 'a'; v 
<= 'd'; v++) { + if (v == 'd') { + // Maximize the chance that the last log file will be preserved in trash + // before restarting the DB. + options.sst_file_manager->SetDeleteRateBytesPerSecond(1); + } ASSERT_OK(Put("Key2", DummyString(1024, v))); ASSERT_OK(Put("Key3", DummyString(1024, v))); ASSERT_OK(Put("Key4", DummyString(1024, v))); @@ -561,11 +832,11 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { if (!wal_dir_same_as_dbname_) { // Forcibly create some trash log files std::unique_ptr result; - env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, - EnvOptions()); + ASSERT_OK(env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions())); result.reset(); } - env->GetChildren(options.wal_dir, &filenames); + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); for (const std::string& fname : filenames) { if (fname.find(".log.trash") != std::string::npos) { trash_log_count++; @@ -574,11 +845,11 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { ASSERT_GE(trash_log_count, 1); env->set_fake_log_delete(false); - ASSERT_OK(TryReopen(options)); + Reopen(options); filenames.clear(); trash_log_count = 0; - env->GetChildren(options.wal_dir, &filenames); + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); for (const std::string& fname : filenames) { if (fname.find(".log.trash") != std::string::npos) { trash_log_count++; @@ -602,13 +873,13 @@ TEST_F(DBSSTTest, OpenDBWithExistingTrash) { Destroy(last_options_); // Add some trash files to the db directory so the DB can clean them up - env_->CreateDirIfMissing(dbname_); + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash")); // Reopen the DB and verify that it deletes existing trash files - ASSERT_OK(TryReopen(options)); + Reopen(options); sfm->WaitForEmptyTrash(); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash")); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash")); @@ -727,7 +998,7 @@ TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) { int num_sst_files = 0; int num_wal_files = 0; std::vector db_files; - env_->GetChildren(dbname_, &db_files); + ASSERT_OK(env_->GetChildren(dbname_, &db_files)); for (std::string f : db_files) { if (f.substr(f.find_last_of(".") + 1) == "sst") { num_sst_files++; @@ -766,7 +1037,7 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) { uint64_t first_file_size = 0; std::unordered_map files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db, &first_file_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &first_file_size)); ASSERT_EQ(sfm->GetTotalSize(), first_file_size); // Set the maximum allowed space usage to the current total size @@ -777,6 +1048,68 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) { ASSERT_NOK(Flush()); } +TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing keys. 
+ for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + + uint64_t files_size = 0; + uint64_t total_files_size = 0; + std::unordered_map files_in_db; + + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db, &files_size)); + // Make sure blob files are considered by SstFileManager in size limits. + ASSERT_GT(files_size, 0); + total_files_size = files_size; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &files_size)); + total_files_size += files_size; + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Set the maximum allowed space usage to the current total size. + sfm->SetMaxAllowedSpaceUsage(files_size + 1); + + bool max_allowed_space_reached = false; + bool delete_blob_file = false; + // Sync point called after blob file is closed and max allowed space is + // checked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached", + [&](void* /*arg*/) { max_allowed_space_reached = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BuildTable::AfterDeleteFile", + [&](void* /*arg*/) { delete_blob_file = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + { + "BuildTable::AfterDeleteFile", + "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1", + }, + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("key1", "val1")); + // This flush will fail + ASSERT_NOK(Flush()); + ASSERT_TRUE(max_allowed_space_reached); + + TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1"); + ASSERT_TRUE(delete_blob_file); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBSSTTest, CancellingCompactionsWorks) { std::shared_ptr sst_file_manager(NewSstFileManager(env_)); auto sfm = static_cast(sst_file_manager.get()); @@ -807,7 +1140,7 @@ TEST_F(DBSSTTest, CancellingCompactionsWorks) { ASSERT_OK(Flush()); uint64_t total_file_size = 0; std::unordered_map files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db, &total_file_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); // Set the maximum allowed space usage to the current total size sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); @@ -816,7 +1149,7 @@ TEST_F(DBSSTTest, CancellingCompactionsWorks) { ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); // Because we set a callback in CancelledCompaction, we actually // let the compaction run @@ -855,7 +1188,7 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { ASSERT_OK(Flush()); uint64_t total_file_size = 0; std::unordered_map files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db, &total_file_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); // Set the maximum allowed space usage to the current total size sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); @@ -866,10 +1199,12 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { ASSERT_OK(Flush()); // OK, now trigger a manual compaction - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); // Make sure the stat is bumped @@ 
-879,10 +1214,13 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { // Now make sure CompactFiles also gets cancelled auto l0_files = collector->GetFlushedFiles(); - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0); + ASSERT_TRUE( + dbfull() + ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0) + .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount( COMPACTION_CANCELLED), @@ -897,8 +1235,9 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), + l0_files, 0)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); ASSERT_GT(completed_compactions, 0); @@ -966,7 +1305,7 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) { ASSERT_TRUE(bg_error_set); uint64_t total_sst_files_size = 0; std::unordered_map files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db, &total_sst_files_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_sst_files_size)); ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -1002,7 +1341,7 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 2; - db_->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); // Create 12 Files in L0 for (int i = 0; i < 12; i++) { @@ -1054,7 +1393,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { std::string val = "val_file_" + ToString(i); ASSERT_OK(Put(Key(j), val)); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("5", FilesPerLevel(0)); @@ -1078,6 +1417,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // hold current version std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter1->status()); // Compact 5 files into 1 file in L0 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1101,12 +1441,13 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // hold current version std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter2->status()); // Delete all keys and compact, this will delete all live files for (int i = 0; i < 10; i++) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel(0)); @@ -1120,6 +1461,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // Total SST files = 6 (5 original files + compacted file) ASSERT_EQ(total_sst_files_size, 6 * single_file_size); + ASSERT_OK(iter1->status()); iter1.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", &total_sst_files_size)); @@ -1127,6 +1469,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // Total SST files = 1 (compacted file) ASSERT_EQ(total_sst_files_size, 1 * single_file_size); + ASSERT_OK(iter2->status()); iter2.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", &total_sst_files_size)); @@ -1145,7 
+1488,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { // Generate 5 files in L0 for (int i = 0; i < 5; i++) { ASSERT_OK(Put(Key(i), "val")); - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("5", FilesPerLevel(0)); @@ -1170,6 +1513,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { // hold current version std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter1->status()); // Compaction will do trivial move from L0 to L1 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1193,12 +1537,13 @@ TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { // hold current version std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter2->status()); // Delete all keys and compact, this will delete all live files for (int i = 0; i < 5; i++) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel(0)); @@ -1212,7 +1557,9 @@ TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { // Total SST files = 5 (used in 2 version) ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + ASSERT_OK(iter1->status()); iter1.reset(); + ASSERT_OK(iter2->status()); iter2.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc index d4e4c628b1d..0874eb40bbc 100644 --- a/db/db_statistics_test.cc +++ b/db/db_statistics_test.cc @@ -137,7 +137,7 @@ TEST_F(DBStatisticsTest, ResetStats) { ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN)); options.statistics->histogramData(DB_WRITE, &histogram_data); ASSERT_GT(histogram_data.max, 0.0); - options.statistics->Reset(); + ASSERT_OK(options.statistics->Reset()); } } } diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 35ebb6ec3af..96cbe9f1a69 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -11,9 +11,11 @@ #include #include "db/db_test_util.h" +#include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/utilities/table_properties_collectors.h" +#include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/random.h" @@ -50,7 +52,7 @@ class DBTablePropertiesTest : public DBTestBase, public testing::WithParamInterface { public: DBTablePropertiesTest() - : DBTestBase("/db_table_properties_test", /*env_do_fsync=*/true) {} + : DBTestBase("/db_table_properties_test", /*env_do_fsync=*/false) {} TablePropertiesCollection TestGetPropertiesOfTablesInRange( std::vector ranges, std::size_t* num_properties = nullptr, std::size_t* num_files = nullptr); @@ -63,9 +65,9 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { // Create 4 tables for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), ToString(table * 100 + i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } // 1. 
Read table properties directly from file @@ -159,14 +161,14 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { for (int i = 0; i < 10000; i++) { ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); - Flush(); + ASSERT_OK(Flush()); } - db_->PauseBackgroundWork(); + ASSERT_OK(db_->PauseBackgroundWork()); // Ensure that we have at least L0, L1 and L2 ASSERT_GT(NumTableFilesAtLevel(0), 0); @@ -234,8 +236,8 @@ TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) { // Create one table per CF, then verify it was created with the column family // name property. for (uint32_t cf = 0; cf < 2; ++cf) { - Put(cf, "key", "val"); - Flush(cf); + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Flush(cf)); TablePropertiesCollection fname_to_props; ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); @@ -258,22 +260,71 @@ TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) { CreateAndReopenWithCF({"goku"}, CurrentOptions()); for (uint32_t cf = 0; cf < 2; ++cf) { - Put(cf, "key", "val"); - Put(cf, "foo", "bar"); - Flush(cf); + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); TablePropertiesCollection fname_to_props; ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); ASSERT_EQ(1U, fname_to_props.size()); std::string id, sid; - db_->GetDbIdentity(id); - db_->GetDbSessionId(sid); + ASSERT_OK(db_->GetDbIdentity(id)); + ASSERT_OK(db_->GetDbSessionId(sid)); ASSERT_EQ(id, fname_to_props.begin()->second->db_id); ASSERT_EQ(sid, fname_to_props.begin()->second->db_session_id); } } +class DBTableHostnamePropertyTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DBTableHostnamePropertyTest() + : DBTestBase("/db_table_hostname_property_test", + /*env_do_fsync=*/false) {} +}; + +TEST_P(DBTableHostnamePropertyTest, DbHostLocationProperty) { + option_config_ = std::get<0>(GetParam()); + Options opts = CurrentOptions(); + std::string expected_host_id = std::get<1>(GetParam()); + ; + if (expected_host_id == kHostnameForDbHostId) { + ASSERT_OK(env_->GetHostNameString(&expected_host_id)); + } else { + opts.db_host_id = expected_host_id; + } + CreateAndReopenWithCF({"goku"}, opts); + + for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + ASSERT_EQ(fname_to_props.begin()->second->db_host_id, expected_host_id); + } +} + +INSTANTIATE_TEST_CASE_P( + DBTableHostnamePropertyTest, DBTableHostnamePropertyTest, + ::testing::Values( + // OptionConfig, override db_host_location + std::make_tuple(DBTestBase::OptionConfig::kDefault, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kDefault, "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kDefault, ""), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + ""))); + class DeletionTriggeredCompactionTestListener : public EventListener { public: void 
OnCompactionBegin(DB* , const CompactionJobInfo& ci) override { @@ -305,8 +356,8 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { // add an L1 file to prevent tombstones from dropping due to obsolescence // during flush - Put(Key(0), "val"); - Flush(); + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); MoveFilesToLevel(1); DeletionTriggeredCompactionTestListener *listener = @@ -317,14 +368,14 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // Change the window size and deletion trigger and ensure new values take @@ -338,14 +389,14 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // Change the window size to disable delete triggered compaction @@ -357,14 +408,14 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_WRITE_BYTES_MARKED)); ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_READ_BYTES_MARKED)); @@ -387,8 +438,8 @@ TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) { // Add an L2 file to prevent tombstones from dropping due to obsolescence // during flush - Put(Key(0), "val"); - Flush(); + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); MoveFilesToLevel(2); auto* listener = new DeletionTriggeredCompactionTestListener(); diff --git a/db/db_tailing_iter_test.cc b/db/db_tailing_iter_test.cc index f33b7cb13a5..d77168d9648 100644 --- a/db/db_tailing_iter_test.cc +++ b/db/db_tailing_iter_test.cc @@ -31,6 +31,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorSingle) { std::unique_ptr iter(db_->NewIterator(read_options)); iter->SeekToFirst(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); // add a record and check that iter can see it ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor")); @@ -48,6 +49,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorKeepAdding) { read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::string value(1024, 'a'); const int num_records = 10000; @@ -70,7 +72,9 @@ TEST_F(DBTestTailingIterator, TailingIteratorSeekToNext) { read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); std::string value(1024, 
'a'); const int num_records = 1000; @@ -138,8 +142,11 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { Slice keyu(bufe, 20); read_options.iterate_upper_bound = &keyu; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); std::unique_ptr iterh(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iterh->status()); std::string value(1024, 'a'); bool file_iters_deleted = false; bool file_iters_renewed_null = false; @@ -179,7 +186,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { if (i % 100 == 99) { ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (i == 299) { file_iters_deleted = true; } @@ -225,6 +232,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { ReopenWithColumnFamilies({"default", "pikachu"}, options); read_options.read_tier = kBlockCacheTier; std::unique_ptr iteri(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iteri->status()); char buf5[32]; snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2); Slice target1(buf5, 20); @@ -236,6 +244,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { options.table_factory.reset(NewBlockBasedTableFactory()); ReopenWithColumnFamilies({"default", "pikachu"}, options); iter.reset(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); for (int i = 2 * num_records; i > 0; --i) { char buf1[32]; char buf2[32]; @@ -262,6 +271,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorDeletes) { read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); // write a single record, read it using the iterator, then delete it ASSERT_OK(Put(1, "0test", "test")); @@ -309,6 +319,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorPrefixSeek) { CreateAndReopenWithCF({"pikachu"}, options); std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); ASSERT_OK(Put(1, "0101", "test")); ASSERT_OK(Flush(1)); @@ -339,6 +350,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorIncomplete) { ASSERT_OK(db_->Put(WriteOptions(), key, value)); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->SeekToFirst(); // we either see the entry or it's not in cache ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); @@ -369,6 +381,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorSeekToSame) { } std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); // Seek to 00001. We expect to find 00002. std::string start_key = "00001"; iter->Seek(start_key); @@ -404,6 +417,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) { ASSERT_OK(Put(1, "21", "21")); std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(it->status()); it->Seek("12"); ASSERT_TRUE(it->Valid()); ASSERT_EQ("12", it->key().ToString()); @@ -411,7 +425,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) { it->Next(); // Not valid since "21" is over the upper bound. ASSERT_FALSE(it->Valid()); - + ASSERT_OK(it->status()); // This keeps track of the number of times NeedToSeekImmutable() was true. 
int immutable_seeks = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -424,6 +438,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); ASSERT_EQ(0, immutable_seeks); } @@ -478,6 +493,8 @@ TEST_F(DBTestTailingIterator, TailingIteratorGap) { it->Next(); ASSERT_TRUE(it->Valid()); ASSERT_EQ("40", it->key().ToString()); + + ASSERT_OK(it->status()); } TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) { @@ -496,6 +513,7 @@ TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) { ASSERT_OK(Flush()); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->Seek("aa"); ASSERT_TRUE(iter->Valid()); @@ -518,6 +536,7 @@ TEST_F(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) { ASSERT_OK(Flush()); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); diff --git a/db/db_test.cc b/db/db_test.cc index 948036cfce2..282631414a8 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -247,17 +247,21 @@ TEST_F(DBTest, SkipDelay) { wo.sync = sync; wo.disableWAL = disableWAL; wo.no_slowdown = true; - dbfull()->Put(wo, "foo", "bar"); + // Large enough to exceed allowance for one time interval + std::string large_value(1024, 'x'); + // Perhaps ideally this first write would fail because of delay, but + // the current implementation does not guarantee that. + dbfull()->Put(wo, "foo", large_value).PermitUncheckedError(); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. - ASSERT_NOK(dbfull()->Put(wo, "foo2", "bar2")); + ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value)); ASSERT_GE(sleep_count.load(), 0); ASSERT_GE(wait_count.load(), 0); token.reset(); - token = dbfull()->TEST_write_controler().GetDelayToken(1000000000); + token = dbfull()->TEST_write_controler().GetDelayToken(1000000); wo.no_slowdown = false; - ASSERT_OK(dbfull()->Put(wo, "foo3", "bar3")); + ASSERT_OK(dbfull()->Put(wo, "foo3", large_value)); ASSERT_GE(sleep_count.load(), 1); token.reset(); } @@ -906,6 +910,9 @@ TEST_F(DBTest, FlushSchedule) { static_cast(options.write_buffer_size); options.max_write_buffer_number = 2; options.write_buffer_size = 120 * 1024; + auto flush_listener = std::make_shared(); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull; + options.listeners.push_back(flush_listener); CreateAndReopenWithCF({"pikachu"}, options); std::vector threads; @@ -1307,51 +1314,6 @@ TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) { } #endif // ROCKSDB_LITE -TEST_F(DBTest, SparseMerge) { - do { - Options options = CurrentOptions(); - options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, options); - - FillLevels("A", "Z", 1); - - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. 
- const std::string value(1000, 'x'); - Put(1, "A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); - } - Put(1, "C", "vc"); - ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - - // Make sparse update - Put(1, "A", "va2"); - Put(1, "B100", "bvalue2"); - Put(1, "C", "vc2"); - ASSERT_OK(Flush(1)); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - } while (ChangeCompactOptions()); -} - #ifndef ROCKSDB_LITE static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); @@ -1384,17 +1346,19 @@ TEST_F(DBTest, ApproximateSizesMemTable) { SizeApproximationOptions size_approx_options; size_approx_options.include_memtabtles = true; size_approx_options.include_files = true; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); ASSERT_LT(size, 204800); // Zero if not including mem table - db_->GetApproximateSizes(&r, 1, &size); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); ASSERT_EQ(size, 0); start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { @@ -1404,13 +1368,15 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(100); end = Key(1020); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); options.max_write_buffer_number = 8; @@ -1436,29 +1402,32 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(100); end = Key(300); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); start = Key(2100); end = Key(2300); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, - &size_with_mt); + 
ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); ASSERT_GT(size_with_mt, 6000); - db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); ASSERT_EQ(size_without_mt, 0); Flush(); @@ -1470,15 +1439,16 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, - &size_with_mt); - db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); ASSERT_GT(size_with_mt, size_without_mt); ASSERT_GT(size_without_mt, 6000); // Check that include_memtabtles flag works as expected size_approx_options.include_memtabtles = false; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, size_without_mt); // Check that files_size_error_margin works as expected, when the heuristic @@ -1487,10 +1457,12 @@ TEST_F(DBTest, ApproximateSizesMemTable) { end = Key(1000 + N - 2); r = Range(start, end); size_approx_options.files_size_error_margin = -1.0; // disabled - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); uint64_t size2; size_approx_options.files_size_error_margin = 0.5; // enabled, but not used - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2)); ASSERT_EQ(size, size2); } @@ -1541,14 +1513,16 @@ TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { // Get the precise size without any approximation heuristic uint64_t size; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size)); ASSERT_NE(size, 0); // Get the size with an approximation heuristic uint64_t size2; const double error_margin = 0.2; size_approx_options.files_size_error_margin = error_margin; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size2)); ASSERT_LT(size2, size * (1 + error_margin)); ASSERT_GT(size2, size * (1 - error_margin)); } @@ -1564,7 +1538,7 @@ TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { const std::string end = Key(i + 11); // overlap by 1 key const Range r(start, end); uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); ASSERT_LE(size, 11 * 100); } } @@ -1632,9 +1606,12 @@ TEST_F(DBTest, ApproximateSizes) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + uint64_t size; + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); // Write 8MB (80 values, each 100K) ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -1647,7 +1624,8 @@ TEST_F(DBTest, ApproximateSizes) { } // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 
0)); + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { @@ -1655,14 +1633,17 @@ TEST_F(DBTest, ApproximateSizes) { for (int compact_start = 0; compact_start < N; compact_start += 10) { for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); - ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), - S2 * (i + 1))); - ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); + ASSERT_OK(Size("", Key(i), 1, &size)); + ASSERT_TRUE(Between(size, S1 * i, S2 * i)); + ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1))); + ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 10, S2 * 10)); } - ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); - ASSERT_TRUE( - Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); std::string cstart_str = Key(compact_start); std::string cend_str = Key(compact_start + 9); @@ -1697,21 +1678,32 @@ TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000))); // Check sizes across recovery by reopening a few times + uint64_t size; for (int run = 0; run < 3; run++) { ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 232000)); - ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 242000)); + ASSERT_OK(Size("", Key(0), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); + ASSERT_OK(Size("", Key(1), 1, &size)); + ASSERT_TRUE(Between(size, 10000, 11000)); + ASSERT_OK(Size("", Key(2), 1, &size)); + ASSERT_TRUE(Between(size, 20000, 21000)); + ASSERT_OK(Size("", Key(3), 1, &size)); + ASSERT_TRUE(Between(size, 120000, 121000)); + ASSERT_OK(Size("", Key(4), 1, &size)); + ASSERT_TRUE(Between(size, 130000, 131000)); + ASSERT_OK(Size("", Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 230000, 232000)); + ASSERT_OK(Size("", Key(6), 1, &size)); + ASSERT_TRUE(Between(size, 240000, 242000)); // Ensure some overhead is accounted for, even without including all - ASSERT_TRUE(Between(Size("", Key(7), 1), 540500, 545000)); - ASSERT_TRUE(Between(Size("", Key(8), 1), 550500, 555000)); + ASSERT_OK(Size("", Key(7), 1, &size)); + ASSERT_TRUE(Between(size, 540500, 545000)); + ASSERT_OK(Size("", Key(8), 1, &size)); + ASSERT_TRUE(Between(size, 550500, 555000)); - ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110100, 111000)); + ASSERT_OK(Size(Key(3), Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 110100, 111000)); dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); } @@ -1795,6 +1787,7 @@ TEST_F(DBTest, Snapshot) { TEST_F(DBTest, HiddenValuesAreRemoved) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; + uint64_t size; do { Options options = CurrentOptions(options_override); CreateAndReopenWithCF({"pikachu"}, options); @@ -1812,7 +1805,8 @@ TEST_F(DBTest, HiddenValuesAreRemoved) { 
ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(big, Get(1, "foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 50000, 60000)); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); Slice x("x"); @@ -1823,7 +1817,8 @@ TEST_F(DBTest, HiddenValuesAreRemoved) { dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 0, 1000)); // ApproximateOffsetOf() is not yet implemented in plain table format, // which is used by Size(). } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | @@ -2346,6 +2341,13 @@ TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) { } TEST_F(DBTest, GetLiveBlobFiles) { + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) below and the periodic stat dumping thread. + Options options = CurrentOptions(); + options.stats_dump_period_sec = 0; + + Reopen(options); + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); assert(versions); assert(versions->GetColumnFamilySet()); @@ -2393,6 +2395,7 @@ TEST_F(DBTest, PurgeInfoLogs) { Options options = CurrentOptions(); options.keep_log_file_num = 5; options.create_if_missing = true; + options.env = env_; for (int mode = 0; mode <= 1; mode++) { if (mode == 1) { options.db_log_dir = dbname_ + "_logs"; @@ -3506,17 +3509,21 @@ TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) { } // Check that FIFO-with-TTL is not supported with max_open_files != -1. +// Github issue #8014 TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleFIFO; options.create_if_missing = true; options.ttl = 600; // seconds - // TTL is now supported with max_open_files != -1. + // TTL is not supported with max_open_files != -1. 
+ options.max_open_files = 0; + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + options.max_open_files = 100; - options = CurrentOptions(options); - ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + // TTL is supported with unlimited max_open_files options.max_open_files = -1; ASSERT_OK(TryReopen(options)); } @@ -3951,6 +3958,7 @@ TEST_F(DBTest, WriteSingleThreadEntry) { TEST_F(DBTest, ConcurrentFlushWAL) { const size_t cnt = 100; Options options; + options.env = env_; WriteOptions wopt; ReadOptions ropt; for (bool two_write_queues : {false, true}) { @@ -4619,6 +4627,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) { Random rnd(301); Options options; + options.env = env_; options.create_if_missing = true; options.db_write_buffer_size = 20480; options.write_buffer_size = 20480; @@ -5021,6 +5030,7 @@ TEST_F(DBTest, DynamicFIFOCompactionOptions) { Options options; options.ttl = 0; options.create_if_missing = true; + options.env = env_; DestroyAndReopen(options); // Initial defaults @@ -5082,6 +5092,7 @@ TEST_F(DBTest, DynamicFIFOCompactionOptions) { TEST_F(DBTest, DynamicUniversalCompactionOptions) { Options options; options.create_if_missing = true; + options.env = env_; DestroyAndReopen(options); // Initial defaults @@ -5307,41 +5318,45 @@ TEST_F(DBTest, DynamicMiscOptions) { #endif // ROCKSDB_LITE TEST_F(DBTest, L0L1L2AndUpHitCounter) { + const int kNumLevels = 3; + const int kNumKeysPerLevel = 10000; + const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel; + Options options = CurrentOptions(); - options.write_buffer_size = 32 * 1024; - options.target_file_size_base = 32 * 1024; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.max_bytes_for_level_base = 64 * 1024; - options.max_write_buffer_number = 2; - options.max_background_compactions = 8; - options.max_background_flushes = 8; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - CreateAndReopenWithCF({"mypikachu"}, options); + Reopen(options); - int numkeys = 20000; - for (int i = 0; i < numkeys; i++) { - ASSERT_OK(Put(1, Key(i), "val")); + // After the below loop there will be one file on each of L0, L1, and L2. + int key = 0; + for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) { + for (int i = 0; i < kNumKeysPerLevel; ++i) { + ASSERT_OK(Put(Key(key), "val")); + key++; + } + ASSERT_OK(Flush()); + for (int input_level = 0; input_level < output_level; ++input_level) { + // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to + // `input_level + 1`. 
+ ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr)); + } } + assert(key == kNumKeysPerDb); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - - for (int i = 0; i < numkeys; i++) { - ASSERT_EQ(Get(1, Key(i)), "val"); + for (int i = 0; i < kNumKeysPerDb; i++) { + ASSERT_EQ(Get(Key(i)), "val"); } - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + - TestGetTickerCount(options, GET_HIT_L1) + - TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); } TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { @@ -6697,20 +6712,19 @@ TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) { Reopen(options); Random rnd(301); bool memory_limit_exceeded = false; - uint64_t size_all_mem_table = 0; - uint64_t cur_active_mem = 0; + + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + for (int i = 0; i < 1000; i++) { std::string value = rnd.RandomString(1000); ASSERT_OK(Put("keykey_" + std::to_string(i), value)); dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_TRUE(db_->GetIntProperty(db_->DefaultColumnFamily(), - DB::Properties::kSizeAllMemTables, - &size_all_mem_table)); - ASSERT_TRUE(db_->GetIntProperty(db_->DefaultColumnFamily(), - DB::Properties::kCurSizeActiveMemTable, - &cur_active_mem)); + const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage(); + const uint64_t size_all_mem_table = + cur_active_mem + cfd->imm()->ApproximateMemoryUsage(); // Errors out if memory usage keeps on increasing beyond the limit. // Once memory limit exceeds, memory_limit_exceeded is set and if diff --git a/db/db_test2.cc b/db/db_test2.cc index ed0f469d0a6..f209cdd0384 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -9,9 +9,11 @@ #include #include #include +#include #include "db/db_test_util.h" #include "db/read_callback.h" +#include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/persistent_cache.h" @@ -40,9 +42,7 @@ TEST_F(DBTest2, OpenForReadOnly) { std::vector files; ASSERT_OK(env_->GetChildren(dbname, &files)); for (auto& f : files) { - if (f != "." && f != "..") { - ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); - } + ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); } // should be empty now and we should be able to delete it ASSERT_OK(env_->DeleteDir(dbname)); @@ -74,9 +74,7 @@ TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { std::vector files; ASSERT_OK(env_->GetChildren(dbname, &files)); for (auto& f : files) { - if (f != "." 
&& f != "..") { - ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); - } + ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); } // should be empty now and we should be able to delete it ASSERT_OK(env_->DeleteDir(dbname)); @@ -104,6 +102,7 @@ class TestReadOnlyWithCompressedCache TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) { if (use_mmap_ && !IsMemoryMappedAccessSupported()) { + ROCKSDB_GTEST_SKIP("Test requires MMAP support"); return; } ASSERT_OK(Put("foo", "bar")); @@ -156,8 +155,14 @@ class PartitionedIndexTestListener : public EventListener { }; TEST_F(DBTest2, PartitionedIndexUserToInternalKey) { + const int kValueSize = 10500; + const int kNumEntriesPerFile = 1000; + const int kNumFiles = 3; + const int kNumDistinctKeys = 30; + BlockBasedTableOptions table_options; Options options = CurrentOptions(); + options.disable_auto_compactions = true; table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; PartitionedIndexTestListener* listener = new PartitionedIndexTestListener(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -166,13 +171,16 @@ TEST_F(DBTest2, PartitionedIndexUserToInternalKey) { Reopen(options); Random rnd(301); - for (int i = 0; i < 3000; i++) { - int j = i % 30; - std::string value = rnd.RandomString(10500); - ASSERT_OK(Put("keykey_" + std::to_string(j), value)); - snapshots.push_back(db_->GetSnapshot()); + for (int i = 0; i < kNumFiles; i++) { + for (int j = 0; j < kNumEntriesPerFile; j++) { + int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys; + std::string value = rnd.RandomString(kValueSize); + ASSERT_OK(Put("keykey_" + std::to_string(key_id), value)); + snapshots.push_back(db_->GetSnapshot()); + } + ASSERT_OK(Flush()); } - Flush(); + for (auto s : snapshots) { db_->ReleaseSnapshot(s); } @@ -291,7 +299,7 @@ TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) { BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); Put(1, "a", "begin"); @@ -337,6 +345,10 @@ class DBTestSharedWriteBufferAcrossCFs TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { Options options = CurrentOptions(); options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. + options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 @@ -380,6 +392,7 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { // Create some data and flush "default" and "nikitich" so that they // are newer CFs created. 
+ flush_listener->expected_flush_reason = FlushReason::kManualFlush; ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); Flush(3); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); @@ -390,6 +403,7 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(1)); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); @@ -514,6 +528,10 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2"); Options options = CurrentOptions(); options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. + options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -551,6 +569,7 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { }; // Trigger a flush on cf2 + flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(2, Key(1), DummyString(70000), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); @@ -1344,7 +1363,7 @@ TEST_F(DBTest2, PresetCompressionDictLocality) { options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); Random rnd(301); @@ -1389,6 +1408,236 @@ TEST_F(DBTest2, PresetCompressionDictLocality) { } } +class PresetCompressionDictTest + : public DBTestBase, + public testing::WithParamInterface> { + public: + PresetCompressionDictTest() + : DBTestBase("/db_test2", false /* env_do_fsync */), + compression_type_(std::get<0>(GetParam())), + bottommost_(std::get<1>(GetParam())) {} + + protected: + const CompressionType compression_type_; + const bool bottommost_; +}; + +INSTANTIATE_TEST_CASE_P( + DBTest2, PresetCompressionDictTest, + ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()), + ::testing::Bool())); + +TEST_P(PresetCompressionDictTest, Flush) { + // Verifies that dictionary is generated and written during flush only when + // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the + // size of the dictionary is within expectations according to the limit on + // buffering set by `CompressionOptions::max_dict_buffer_bytes`. 
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.memtable_factory.reset(new SpecialSkipListFactory(kKeysPerFile)); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (size_t i = 0; i <= kKeysPerFile; ++i) { + ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a + // compression dictionary exists since dictionaries would be preloaded when + // the flush finishes. + if (bottommost_) { + // Flush is never considered bottommost. This should change in the future + // since flushed files may have nothing underneath them, like the one in + // this test case. + ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + } else { + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in + // ZSTD's digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit + // after each block is built. + ASSERT_LE(TestGetTickerCount(options, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 2 * kBlockLen); + } + } +} + +TEST_P(PresetCompressionDictTest, CompactNonBottommost) { + // Verifies that dictionary is generated and written during compaction to + // non-bottommost level only when `ColumnFamilyOptions::compression` enables + // dictionary. Also verifies the size of the dictionary is within expectations + // according to the limit on buffering set by + // `CompressionOptions::max_dict_buffer_bytes`. 
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + for (int i = 0; i < 2; ++i) { + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + } +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + uint64_t prev_compression_dict_bytes_inserted = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); + // This L0->L1 compaction merges the two L0 files into L1. The produced L1 + // file is not bottommost due to the existing L2 file covering the same key- + // range. + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a + // compression dictionary exists since dictionaries would be preloaded when + // the compaction finishes. + if (bottommost_) { + ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + } else { + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in + // ZSTD's digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit + // after each block is built. + ASSERT_LE(TestGetTickerCount(options, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted + 2 * kBlockLen); + } + } +} + +TEST_P(PresetCompressionDictTest, CompactBottommost) { + // Verifies that dictionary is generated and written during compaction to + // non-bottommost level only when either `ColumnFamilyOptions::compression` or + // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also + // verifies the size of the dictionary is within expectations according to the + // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`. 
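+  // Unlike the non-bottommost case, the full-range CompactRange() below
+  // leaves nothing underneath its output, so the result is bottommost and a
+  // dictionary is expected regardless of whether it was enabled through
+  // `compression` or `bottommost_compression`.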
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + } +#ifndef ROCKSDB_LITE + ASSERT_EQ("2", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + uint64_t prev_compression_dict_bytes_inserted = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in ZSTD's + // digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit after + // each block is built. 
+ ASSERT_LE( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted + 2 * kBlockLen); + } +} + class CompactionCompressionListener : public EventListener { public: explicit CompactionCompressionListener(Options* db_options) @@ -1470,7 +1719,7 @@ TEST_P(CompressionFailuresTest, CompressionFailures) { BlockBasedTableOptions table_options; table_options.block_size = 512; table_options.verify_compression = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.compression = compression_type_; options.compression_opts.parallel_threads = compression_parallel_threads_; @@ -1808,7 +2057,7 @@ class PinL0IndexAndFilterBlocksTest table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options->table_factory.reset(new BlockBasedTableFactory(table_options)); + options->table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, *options); Put(1, "a", "begin"); @@ -1848,7 +2097,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); @@ -2485,26 +2734,30 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { { const int kIdBufLen = 100; char id_buf[kIdBufLen]; + Status s = Status::NotSupported(); #ifndef OS_WIN // You can't open a directory on windows using random access file std::unique_ptr file; - ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions())); - if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { - // fs holding db directory doesn't support getting a unique file id, - // this means that running this test will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; - } -#else - std::unique_ptr dir; - ASSERT_OK(env_->NewDirectory(dbname_, &dir)); - if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { - // fs holding db directory doesn't support getting a unique file id, - // this means that running this test will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; + s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions()); + if (s.ok()) { + if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will + // load the blocks again regardless of them being already in the cache + return; + } } #endif + if (!s.ok()) { + std::unique_ptr dir; + ASSERT_OK(env_->NewDirectory(dbname_, &dir)); + if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will + // load the blocks again regardless of them being already in the cache + return; + } + } } uint32_t bytes_per_bit[2] = {1, 16}; for (size_t k = 0; k < 2; k++) { @@ -2949,6 +3202,180 @@ TEST_F(DBTest2, PausingManualCompaction4) { 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, CancelManualCompaction1) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); + } + Flush(); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + int run_manual_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:1", + [&](void* /*arg*/) { run_manual_compactions++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Setup a callback to disable compactions after a couple of levels are + // compacted + int compactions_run = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", + [&](void* /*arg*/) { ++compactions_run; }); + + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + + // Since compactions are disabled, we shouldn't start compacting. + // E.g. we should call the compaction function exactly one time. + ASSERT_EQ(compactions_run, 0); + ASSERT_EQ(run_manual_compactions, 0); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + compactions_run = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { + ++compactions_run; + // After 3 compactions disable + if (compactions_run == 3) { + compact_options.canceled->store(true, std::memory_order_release); + } + }); + + compact_options.canceled->store(false, std::memory_order_release); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + + ASSERT_EQ(compactions_run, 3); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + + // Compactions should work again if we re-enable them.. 
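+  // `CompactRangeOptions::canceled` is only consulted by the compaction that
+  // is currently running; clearing it back to false before issuing a new
+  // CompactRange() lets the next manual compaction run to completion.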
+ compact_options.canceled->store(false, std::memory_order_relaxed); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, CancelManualCompaction2) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + compact_options.max_subcompactions = 1; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); + } + Flush(); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + int compactions_run = 0; + std::atomic kv_compactions{0}; + int compactions_stopped_at = 0; + int kv_compactions_stopped_at = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { + ++compactions_run; + // After 3 compactions disable + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator:ProcessKV", [&](void* /*arg*/) { + int kv_compactions_run = + kv_compactions.fetch_add(1, std::memory_order_release); + if (kv_compactions_run == 5) { + compact_options.canceled->store(true, std::memory_order_release); + kv_compactions_stopped_at = kv_compactions_run; + compactions_stopped_at = compactions_run; + } + }); + + compact_options.canceled->store(false, std::memory_order_release); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + + // NOTE: as we set compact_options.max_subcompacitons = 1, and store true to + // the canceled variable from the single compacting thread (via callback), + // this value is deterministically kv_compactions_stopped_at + 1. + ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1); + ASSERT_EQ(compactions_run, compactions_stopped_at); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionIterator::ProcessKV"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + + // Compactions should work again if we re-enable them.. 
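+  // With the cancellation flag cleared, this final manual compaction runs to
+  // completion and pushes all data to the last level, which the
+  // FilesPerLevel() check below verifies.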
+ compact_options.canceled->store(false, std::memory_order_relaxed); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest2, OptimizeForPointLookup) { Options options = CurrentOptions(); Close(); @@ -3297,7 +3724,7 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { BlockBasedTableOptions bbto; bbto.block_size = 16384; bbto.no_block_cache = true; - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); for (int i = 0; i < kNumL0Files; ++i) { @@ -3342,6 +3769,7 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { // is on levels higher than the new num_levels. TEST_F(DBTest2, ReduceLevel) { Options options; + options.env = env_; options.disable_auto_compactions = true; options.num_levels = 7; Reopen(options); @@ -3370,6 +3798,7 @@ TEST_F(DBTest2, ReadCallbackTest) { Options options; options.disable_auto_compactions = true; options.num_levels = 7; + options.env = env_; Reopen(options); std::vector snapshots; // Try to create a db with multiple layers and a memtable @@ -3503,20 +3932,26 @@ TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { TEST_F(DBTest2, TestNumPread) { Options options = CurrentOptions(); + bool prefetch_supported = + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); // disable block cache BlockBasedTableOptions table_options; table_options.no_block_cache = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); env_->count_random_reads_ = true; - env_->random_file_open_counter_.store(0); ASSERT_OK(Put("bar", "foo")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); - // After flush, we'll open the file and read footer, meta block, - // property block and index block. - ASSERT_EQ(4, env_->random_read_counter_.Read()); + if (prefetch_supported) { + // After flush, we'll open the file and read footer, meta block, + // property block and index block. + ASSERT_EQ(4, env_->random_read_counter_.Read()); + } else { + // With prefetch not supported, we will do a single read into a buffer + ASSERT_EQ(1, env_->random_read_counter_.Read()); + } ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read @@ -3532,19 +3967,30 @@ TEST_F(DBTest2, TestNumPread) { ASSERT_OK(Put("bar2", "foo2")); ASSERT_OK(Put("foo2", "bar2")); ASSERT_OK(Flush()); - // After flush, we'll open the file and read footer, meta block, - // property block and index block. - ASSERT_EQ(4, env_->random_read_counter_.Read()); + if (prefetch_supported) { + // After flush, we'll open the file and read footer, meta block, + // property block and index block. + ASSERT_EQ(4, env_->random_read_counter_.Read()); + } else { + // With prefetch not supported, we will do a single read into a buffer + ASSERT_EQ(1, env_->random_read_counter_.Read()); + } ASSERT_EQ(1, env_->random_file_open_counter_.load()); - // Compaction needs two input blocks, which requires 2 preads, and - // generate a new SST file which needs 4 preads (footer, meta block, - // property block and index block). In total 6. 
env_->random_file_open_counter_.store(0); env_->random_read_counter_.Reset(); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ(6, env_->random_read_counter_.Read()); - // All compactin input files should have already been opened. + if (prefetch_supported) { + // Compaction needs two input blocks, which requires 2 preads, and + // generate a new SST file which needs 4 preads (footer, meta block, + // property block and index block). In total 6. + ASSERT_EQ(6, env_->random_read_counter_.Read()); + } else { + // With prefetch off, compaction needs two input blocks, + // followed by a single buffered read. In total 3. + ASSERT_EQ(3, env_->random_read_counter_.Read()); + } + // All compaction input files should have already been opened. ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read @@ -3629,7 +4075,9 @@ TEST_F(DBTest2, TraceAndReplay) { column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3704,7 +4152,9 @@ TEST_F(DBTest2, TraceWithLimit) { column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3772,7 +4222,9 @@ TEST_F(DBTest2, TraceWithSampling) { column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); @@ -3850,7 +4302,7 @@ TEST_F(DBTest2, TraceWithFilter) { // Open another db, replay, and verify the data std::string value; - std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + std::string dbname2 = test::PerThreadDBPath(env_, "db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime @@ -3872,7 +4324,9 @@ TEST_F(DBTest2, TraceWithFilter) { column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3899,7 +4353,7 @@ TEST_F(DBTest2, TraceWithFilter) { ASSERT_OK(DestroyDB(dbname2, options)); // Set up a new db. 
- std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read"; + std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read"); ASSERT_OK(DestroyDB(dbname3, options)); DB* db3_init = nullptr; @@ -3918,7 +4372,7 @@ TEST_F(DBTest2, TraceWithFilter) { handles.clear(); DB* db3 = nullptr; - ASSERT_OK(DB::Open(DBOptions(), dbname3, column_families, &handles, &db3)); + ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3974,6 +4428,11 @@ TEST_F(DBTest2, TraceWithFilter) { TEST_F(DBTest2, PinnableSliceAndMmapReads) { Options options = CurrentOptions(); + options.env = env_; + if (!IsMemoryMappedAccessSupported()) { + ROCKSDB_GTEST_SKIP("Test requires default environment"); + return; + } options.allow_mmap_reads = true; options.max_open_files = 100; options.compression = kNoCompression; @@ -4026,7 +4485,7 @@ TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { bbto.cache_index_and_filter_blocks = false; bbto.block_cache = NewLRUCache(100000); bbto.block_size = 400; // small block size - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); Random rnd(301); @@ -4252,6 +4711,7 @@ TEST_F(DBTest2, TestCompactFiles) { SyncPoint::GetInstance()->EnableProcessing(); Options options; + options.env = env_; options.num_levels = 2; options.disable_auto_compactions = true; Reopen(options); @@ -4310,7 +4770,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) { Options options = CurrentOptions(); std::vector dbnames; for (int i = 0; i < kNumDbs; ++i) { - dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i)); + dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + ToString(i))); ASSERT_OK(DestroyDB(dbnames.back(), options)); } @@ -4376,8 +4836,8 @@ class DummyOldStats : public Statistics { } bool HistEnabledForType(uint32_t /*type*/) const override { return false; } std::string ToString() const override { return ""; } - int num_rt = 0; - int num_mt = 0; + std::atomic num_rt{0}; + std::atomic num_mt{0}; }; } // namespace @@ -4569,7 +5029,7 @@ TEST_F(DBTest2, CrashInRecoveryMultipleCF) { for (const auto& f : filenames) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) { + if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) { std::string fname = dbname_ + "/" + f; std::string file_content; ASSERT_OK(ReadFileToString(env_, fname, &file_content)); @@ -4703,7 +5163,7 @@ TEST_F(DBTest2, SameSmallestInSameLevel) { ASSERT_OK(Put("key", "2")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "3")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "4")); - Flush(); + ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; @@ -4807,6 +5267,7 @@ TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) { TEST_F(DBTest2, PartitionedIndexPrefetchFailure) { Options options = last_options_; + options.env = env_; options.max_open_files = 20; BlockBasedTableOptions bbto; bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; @@ -5153,6 +5614,120 @@ TEST_F(DBTest2, AutoPrefixMode1) { ASSERT_EQ("a1", iterator->key().ToString()); } } + +class RenameCurrentTest : public DBTestBase, + public testing::WithParamInterface { + public: + RenameCurrentTest() + : DBTestBase("rename_current_test", /*env_do_fsync=*/true), + sync_point_(GetParam()) {} + + ~RenameCurrentTest() override {} + + void SetUp() override { + 
env_->no_file_overwrite_.store(true, std::memory_order_release); + } + + void TearDown() override { + env_->no_file_overwrite_.store(false, std::memory_order_release); + } + + void SetupSyncPoints() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) { + Status* s = reinterpret_cast(arg); + assert(s); + *s = Status::IOError("Injected IO error."); + }); + } + + const std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest, + ::testing::Values("SetCurrentFile:BeforeRename", + "SetCurrentFile:AfterRename")); + +TEST_P(RenameCurrentTest, Open) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = TryReopen(options); + ASSERT_NOK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); +} + +TEST_P(RenameCurrentTest, Flush) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.max_manifest_file_size = 1; + options.create_if_missing = true; + Reopen(options); + ASSERT_OK(Put("key", "value")); + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(Flush()); + + ASSERT_NOK(Put("foo", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST_P(RenameCurrentTest, Compaction) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.max_manifest_file_size = 1; + options.create_if_missing = true; + Reopen(options); + ASSERT_OK(Put("a", "a_value")); + ASSERT_OK(Put("c", "c_value")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("b", "b_value")); + ASSERT_OK(Put("d", "d_value")); + ASSERT_OK(Flush()); + + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + ASSERT_NOK(Put("foo", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ("d_value", Get("d")); +} + +TEST_F(DBTest2, BottommostTemperature) { + Options options = CurrentOptions(); + options.bottommost_temperature = Temperature::kWarm; + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Reopen(options); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); +} #endif // ROCKSDB_LITE // WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery. 
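An illustrative aside, not part of the patch: the RenameCurrentTest cases above rely on RocksDB's test-only SyncPoint facility to make the CURRENT-file rename fail. A minimal sketch of that injection pattern, mirroring the Status-overwriting callback in SetupSyncPoints() and the sync point names used by the parameterized tests, is shown below; the helper name is hypothetical.

#include <string>

#include "rocksdb/status.h"
#include "test_util/sync_point.h"

// Hypothetical helper mirroring RenameCurrentTest::SetupSyncPoints(): the
// callback receives a pointer to the Status that SetCurrentFile() is about to
// return and overwrites it, simulating a failed rename of the CURRENT file.
void InjectCurrentRenameFailure(const std::string& sync_point) {
  using ROCKSDB_NAMESPACE::SyncPoint;
  SyncPoint::GetInstance()->SetCallBack(sync_point, [](void* arg) {
    auto* s = reinterpret_cast<ROCKSDB_NAMESPACE::Status*>(arg);
    *s = ROCKSDB_NAMESPACE::Status::IOError("Injected IO error.");
  });
  SyncPoint::GetInstance()->EnableProcessing();
}

// Usage (as in the tests above):
//   InjectCurrentRenameFailure("SetCurrentFile:BeforeRename");
//   ... open/flush/compact and expect a non-OK status ...
//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();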
@@ -5180,6 +5755,34 @@ TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) { Status s = TryReopen(options); ASSERT_TRUE(s.IsIOError()); } + +TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:Start:1", + "PointInTimeRecoveryWithSyncFailureInCFCreation:1"}, + {"PointInTimeRecoveryWithSyncFailureInCFCreation:2", + "DBImpl::BackgroundCallFlush:Start:2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CreateColumnFamilies({"test1"}, Options()); + ASSERT_OK(Put("foo", "bar")); + + // Creating a CF when a flush is going on, log is synced but the + // closed log file is not synced and corrupted. + port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); }); + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1"); + CreateColumnFamilies({"test2"}, Options()); + env_->corrupt_in_sync_ = true; + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2"); + flush_thread.join(); + env_->corrupt_in_sync_ = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // Reopening the DB should not corrupt anything + Options options = CurrentOptions(); + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + ReopenWithColumnFamilies({"default", "test1", "test2"}, options); +} } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/db_test_util.cc b/db/db_test_util.cc index dd79a71950c..1d81774815a 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "db/forward_iterator.h" +#include "env/mock_env.h" #include "rocksdb/convenience.h" #include "rocksdb/env_encryption.h" #include "util/stderr_logger.h" @@ -52,6 +53,7 @@ SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep) manifest_sync_error_.store(false, std::memory_order_release); manifest_write_error_.store(false, std::memory_order_release); log_write_error_.store(false, std::memory_order_release); + no_file_overwrite_.store(false, std::memory_order_release); random_file_open_counter_.store(0, std::memory_order_relaxed); delete_count_.store(0, std::memory_order_relaxed); num_open_wal_file_.store(0); @@ -70,16 +72,8 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) option_config_(kDefault), s3_env_(nullptr) { Env* base_env = Env::Default(); -#ifndef ROCKSDB_LITE - const char* test_env_uri = getenv("TEST_ENV_URI"); - if (test_env_uri) { - Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } -#endif // !ROCKSDB_LITE + ConfigOptions config_options; + EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); if (getenv("MEM_ENV")) { mem_env_ = new MockEnv(base_env); @@ -88,7 +82,7 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) if (getenv("ENCRYPTED_ENV")) { std::shared_ptr provider; Status s = EncryptionProvider::CreateFromString( - ConfigOptions(), std::string("test://") + getenv("ENCRYPTED_ENV"), + config_options, std::string("test://") + getenv("ENCRYPTED_ENV"), &provider); encrypted_env_ = NewEncryptedEnv(mem_env_ ? 
mem_env_ : base_env, provider); } @@ -254,28 +248,28 @@ bool DBTestBase::ChangeCompactOptions() { Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kUniversalCompaction) { option_config_ = kUniversalCompactionMultiLevel; Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kUniversalCompactionMultiLevel) { option_config_ = kLevelSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kLevelSubcompactions) { option_config_ = kUniversalSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + Reopen(options); return true; } else { return false; @@ -290,7 +284,7 @@ bool DBTestBase::ChangeWalOptions() { auto options = CurrentOptions(); Destroy(options); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kDBLogDir) { option_config_ = kWalDirAndMmapReads; @@ -298,14 +292,14 @@ bool DBTestBase::ChangeWalOptions() { auto options = CurrentOptions(); Destroy(options); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kWalDirAndMmapReads) { option_config_ = kRecycleLogFiles; Destroy(last_options_); auto options = CurrentOptions(); Destroy(options); - TryReopen(options); + Reopen(options); return true; } else { return false; @@ -385,7 +379,7 @@ Options DBTestBase::CurrentOptions( return GetOptions(option_config_, default_options, options_override); } -Options DBTestBase::GetDefaultOptions() { +Options DBTestBase::GetDefaultOptions() const { Options options; options.write_buffer_size = 4090 * 4096; options.target_file_size_base = 2 * 1024 * 1024; @@ -393,6 +387,10 @@ Options DBTestBase::GetDefaultOptions() { options.max_open_files = 5000; options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; options.compaction_pri = CompactionPri::kByCompensatedSize; + options.env = env_; + if (!env_->skip_fsync_) { + options.track_and_verify_wals_in_manifest = true; + } return options; } @@ -421,28 +419,28 @@ Options DBTestBase::GetOptions( options.unordered_write = false; break; case kPlainTableFirstBytePrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableCappedPrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewCappedPrefixTransform(8)); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableCappedPrefixNonMmap: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewCappedPrefixTransform(8)); options.allow_mmap_reads = false; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case 
kPlainTableAllBytesPrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; @@ -536,6 +534,7 @@ Options DBTestBase::GetOptions( } case kFIFOCompaction: { options.compaction_style = kCompactionStyleFIFO; + options.max_open_files = -1; break; } case kBlockBasedTableWithPrefixHashIndex: { @@ -549,6 +548,7 @@ Options DBTestBase::GetOptions( break; } case kBlockBasedTableWithPartitionedIndex: { + table_options.format_version = 3; table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.prefix_extractor.reset(NewNoopTransform()); break; @@ -774,7 +774,7 @@ void DBTestBase::Close() { void DBTestBase::DestroyAndReopen(const Options& options) { // Destroy using last options Destroy(last_options_); - ASSERT_OK(TryReopen(options)); + Reopen(options); } void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) { @@ -819,9 +819,9 @@ Status DBTestBase::TryReopen(const Options& options) { // Note: operator= is an unsafe approach here since it destructs // std::shared_ptr in the same order of their creation, in contrast to // destructors which destructs them in the opposite order of creation. One - // particular problme is that the cache destructor might invoke callback + // particular problem is that the cache destructor might invoke callback // functions that use Option members such as statistics. To work around this - // problem, we manually call destructor of table_facotry which eventually + // problem, we manually call destructor of table_factory which eventually // clears the block cache. last_options_ = options; MaybeInstallTimeElapseOnlySleep(options); @@ -1078,7 +1078,8 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { bool first = true; while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - if (ParseInternalKey(iter->key(), &ikey) != Status::OK()) { + if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) != + Status::OK()) { result += "CORRUPTED"; } else { if (!last_options_.comparator->Equal(ikey.user_key, user_key)) { @@ -1235,29 +1236,77 @@ std::string DBTestBase::FilesPerLevel(int cf) { result.resize(last_non_zero_offset); return result; } + #endif // !ROCKSDB_LITE +std::vector DBTestBase::GetBlobFileNumbers() { + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + std::vector result; + result.reserve(blob_files.size()); + + for (const auto& blob_file : blob_files) { + result.emplace_back(blob_file.first); + } + + return result; +} + size_t DBTestBase::CountFiles() { + size_t count = 0; std::vector files; - env_->GetChildren(dbname_, &files); + if (env_->GetChildren(dbname_, &files).ok()) { + count += files.size(); + } - std::vector logfiles; if (dbname_ != last_options_.wal_dir) { - env_->GetChildren(last_options_.wal_dir, &logfiles); + if (env_->GetChildren(last_options_.wal_dir, &files).ok()) { + count += files.size(); + } } - return files.size() + logfiles.size(); + return count; +}; + +Status DBTestBase::CountFiles(size_t* count) { + std::vector 
files; + Status s = env_->GetChildren(dbname_, &files); + if (!s.ok()) { + return s; + } + size_t files_count = files.size(); + + if (dbname_ != last_options_.wal_dir) { + s = env_->GetChildren(last_options_.wal_dir, &files); + if (!s.ok()) { + return s; + } + *count = files_count + files.size(); + } + + return Status::OK(); } -uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) { +Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf, + uint64_t* size) { Range r(start, limit); - uint64_t size; if (cf == 0) { - db_->GetApproximateSizes(&r, 1, &size); + return db_->GetApproximateSizes(&r, 1, size); } else { - db_->GetApproximateSizes(handles_[1], &r, 1, &size); + return db_->GetApproximateSizes(handles_[1], &r, 1, size); } - return size; } void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit, @@ -1376,8 +1425,8 @@ void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) { } ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200))); if (!nowait) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -1485,12 +1534,12 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) { kMaxSequenceNumber)); } iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); + ASSERT_OK(iter->status()); int seq = numValues; while (iter->Valid()) { ParsedInternalKey ikey; ikey.clear(); - ASSERT_OK(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); // checks sequence number for updates ASSERT_EQ(ikey.sequence, (unsigned)seq--); @@ -1523,26 +1572,26 @@ void DBTestBase::CopyFile(const std::string& source, ASSERT_OK(destfile->Close()); } -Status DBTestBase::GetAllSSTFiles( - std::unordered_map* sst_files, +Status DBTestBase::GetAllDataFiles( + const FileType file_type, std::unordered_map* files, uint64_t* total_size /* = nullptr */) { if (total_size) { *total_size = 0; } - std::vector files; - Status s = env_->GetChildren(dbname_, &files); + std::vector children; + Status s = env_->GetChildren(dbname_, &children); if (s.ok()) { - for (auto& file_name : files) { + for (auto& file_name : children) { uint64_t number; FileType type; - if (ParseFileName(file_name, &number, &type) && type == kTableFile) { + if (ParseFileName(file_name, &number, &type) && type == file_type) { std::string file_path = dbname_ + "/" + file_name; uint64_t file_size = 0; s = env_->GetFileSize(file_path, &file_size); if (!s.ok()) { break; } - (*sst_files)[file_path] = file_size; + (*files)[file_path] = file_size; if (total_size) { *total_size += file_size; } @@ -1695,7 +1744,7 @@ void DBTestBase::VerifyDBInternal( for (auto p : true_data) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey ikey; - ASSERT_OK(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); ASSERT_EQ(p.first, ikey.user_key); ASSERT_EQ(p.second, iter->value()); iter->Next(); diff --git a/db/db_test_util.h b/db/db_test_util.h index a78a37eea6a..bcb93c8055d 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -23,7 +23,6 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/mock_env.h" #include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" @@ -40,7 +39,6 @@ #include "rocksdb/utilities/checkpoint.h" #include "table/mock_table.h" #include "table/scoped_arena_iterator.h" -#include 
"test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "util/cast_util.h" @@ -52,6 +50,7 @@ extern "C" bool RocksDbFileChecksumsVerificationEnabledOnRecovery(); namespace ROCKSDB_NAMESPACE { +class MockEnv; namespace anon { class AtomicCounter { @@ -232,6 +231,11 @@ class SpecialEnv : public EnvWrapper { return base_->Append(data); } } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } Status PositionedAppend(const Slice& data, uint64_t offset) override { if (env_->table_write_callback_) { (*env_->table_write_callback_)(); @@ -246,6 +250,11 @@ class SpecialEnv : public EnvWrapper { return base_->PositionedAppend(data, offset); } } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& /* verification_info */) override { + return PositionedAppend(data, offset); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status RangeSync(uint64_t offset, uint64_t nbytes) override { Status s = base_->RangeSync(offset, nbytes); @@ -296,6 +305,9 @@ class SpecialEnv : public EnvWrapper { Status Allocate(uint64_t offset, uint64_t len) override { return base_->Allocate(offset, len); } + size_t GetUniqueId(char* id, size_t max_size) const override { + return base_->GetUniqueId(id, max_size); + } }; class ManifestFile : public WritableFile { public: @@ -308,6 +320,12 @@ class SpecialEnv : public EnvWrapper { return base_->Append(data); } } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status Close() override { return base_->Close(); } Status Flush() override { return base_->Flush(); } @@ -359,15 +377,26 @@ class SpecialEnv : public EnvWrapper { #endif return s; } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } + void PrepareWrite(size_t offset, size_t len) override { + base_->PrepareWrite(offset, len); + } + void SetPreallocationBlockSize(size_t size) override { + base_->SetPreallocationBlockSize(size); + } Status Close() override { // SyncPoint is not supported in Released Windows Mode. #if !(defined NDEBUG) || !defined(OS_WIN) // Check preallocation size - // preallocation size is never passed to base file. 
- size_t preallocation_size = preallocation_block_size(); + size_t block_size, last_allocated_block; + base_->GetPreallocationStatus(&block_size, &last_allocated_block); TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus", - &preallocation_size); + &block_size); #endif // !(defined NDEBUG) || !defined(OS_WIN) return base_->Close(); @@ -375,6 +404,10 @@ class SpecialEnv : public EnvWrapper { Status Flush() override { return base_->Flush(); } Status Sync() override { ++env_->sync_counter_; + if (env_->corrupt_in_sync_) { + Append(std::string(33000, ' ')); + return Status::IOError("Ingested Sync Failure"); + } if (env_->skip_fsync_) { return Status::OK(); } else { @@ -397,6 +430,11 @@ class SpecialEnv : public EnvWrapper { OtherFile(SpecialEnv* env, std::unique_ptr&& b) : env_(env), base_(std::move(b)) {} Status Append(const Slice& data) override { return base_->Append(data); } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status Close() override { return base_->Close(); } Status Flush() override { return base_->Flush(); } @@ -417,6 +455,11 @@ class SpecialEnv : public EnvWrapper { std::unique_ptr base_; }; + if (no_file_overwrite_.load(std::memory_order_acquire) && + target()->FileExists(f).ok()) { + return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true."); + } + if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { uint32_t random_number; { @@ -664,6 +707,9 @@ class SpecialEnv : public EnvWrapper { // Slow down every log write, in micro-seconds. std::atomic log_write_slowdown_; + // If true, returns Status::NotSupported for file overwrite. + std::atomic no_file_overwrite_; + // Number of WAL files that are still open for write. std::atomic num_open_wal_file_; @@ -686,6 +732,9 @@ class SpecialEnv : public EnvWrapper { // If true, all fsync to files and directories are skipped. bool skip_fsync_ = false; + // If true, ingest the corruption to file during sync. 
+ bool corrupt_in_sync_ = false; + std::atomic non_writeable_rate_; std::atomic new_writable_count_; @@ -738,6 +787,17 @@ class OnFileDeletionListener : public EventListener { size_t matched_count_; std::string expected_file_name_; }; + +class FlushCounterListener : public EventListener { + public: + std::atomic count{0}; + std::atomic expected_flush_reason{FlushReason::kOthers}; + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { + count++; + ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason); + } +}; #endif // A test merge operator mimics put but also fails if one of merge operands is @@ -771,6 +831,7 @@ class CacheWrapper : public Cache { const char* Name() const override { return target_->Name(); } + using Cache::Insert; Status Insert(const Slice& key, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Handle** handle = nullptr, @@ -778,12 +839,14 @@ class CacheWrapper : public Cache { return target_->Insert(key, value, charge, deleter, handle, priority); } + using Cache::Lookup; Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { return target_->Lookup(key, stats); } bool Ref(Handle* handle) override { return target_->Ref(handle); } + using Cache::Release; bool Release(Handle* handle, bool force_erase = false) override { return target_->Release(handle, force_erase); } @@ -817,11 +880,22 @@ class CacheWrapper : public Cache { return target_->GetCharge(handle); } + DeleterFn GetDeleter(Handle* handle) const override { + return target_->GetDeleter(handle); + } + void ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) override { target_->ApplyToAllCacheEntries(callback, thread_safe); } + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + target_->ApplyToAllEntries(callback, opts); + } + void EraseUnRefEntries() override { target_->EraseUnRefEntries(); } protected: @@ -967,10 +1041,13 @@ class DBTestBase : public testing::Test { const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; - static Options GetDefaultOptions(); + Options GetDefaultOptions() const; + + Options GetOptions(int option_config) const { + return GetOptions(option_config, GetDefaultOptions()); + } - Options GetOptions(int option_config, - const Options& default_options = GetDefaultOptions(), + Options GetOptions(int option_config, const Options& default_options, const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; @@ -1080,12 +1157,20 @@ class DBTestBase : public testing::Test { int TotalTableFiles(int cf = 0, int levels = -1); #endif // ROCKSDB_LITE + std::vector GetBlobFileNumbers(); + // Return spread of files per level std::string FilesPerLevel(int cf = 0); size_t CountFiles(); - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0); + Status CountFiles(size_t* count); + + Status Size(const Slice& start, const Slice& limit, uint64_t* size) { + return Size(start, limit, 0, size); + } + + Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size); void Compact(int cf, const Slice& start, const Slice& limit, uint32_t target_path_id); @@ -1163,8 +1248,9 @@ class DBTestBase : public testing::Test { void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0); - Status GetAllSSTFiles(std::unordered_map* sst_files, - uint64_t* total_size = nullptr); + Status GetAllDataFiles(const FileType file_type, + std::unordered_map* sst_files, + uint64_t* 
total_size = nullptr); std::vector ListTableFiles(Env* env, const std::string& path); diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 119fc66c508..548b8ae0e08 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -996,7 +996,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) { // First, clean up memtable before inserting new data. This will generate // a level-0 file, with size around 0.4 (according to previously written // data amount). - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; num++) { // Write 110KB (11 values, each 10K) @@ -1781,7 +1781,7 @@ TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) { dbfull()->TEST_write_controler().GetCompactionPressureToken(); ASSERT_OK(Put("key", "val")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1); ColumnFamilyMetaData cf_meta; @@ -1807,7 +1807,7 @@ TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) { "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"); for (int i = 0; i < 2; ++i) { ASSERT_OK(Put("key", "val")); - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -1911,7 +1911,7 @@ TEST_F(DBTestUniversalCompaction2, BasicL0toL1) { for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); // MoveFilesToLevel(6); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1923,7 +1923,7 @@ TEST_F(DBTestUniversalCompaction2, BasicL0toL1) { ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -1954,7 +1954,7 @@ TEST_F(DBTestUniversalCompaction2, SingleLevel) { for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1999; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && @@ -1964,7 +1964,7 @@ TEST_F(DBTestUniversalCompaction2, SingleLevel) { ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()(; ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); @@ -1992,19 +1992,19 @@ TEST_F(DBTestUniversalCompaction2, MultipleLevels) { for (i = 0; i < 500; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 500; i < 1000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1000; i < 1500; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1500; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -2013,15 +2013,15 @@ TEST_F(DBTestUniversalCompaction2, MultipleLevels) { for (i = 1999; i < 2333; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2333; i < 2666; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2666; i < 2999; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -2031,7 +2031,7 @@ TEST_F(DBTestUniversalCompaction2, MultipleLevels) { for (i = 1900; i < 2100; ++i) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -2064,19 +2064,19 @@ TEST_F(DBTestUniversalCompaction2, OverlappingL0) { for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2000; i < 3000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 3500; i < 4000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2900; i < 3100; ++i) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(2, NumTableFilesAtLevel(0)); @@ -2106,7 +2106,7 @@ TEST_F(DBTestUniversalCompaction2, IngestBehind) { for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); // MoveFilesToLevel(6); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -2118,7 +2118,7 @@ TEST_F(DBTestUniversalCompaction2, IngestBehind) { ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -2129,7 +2129,7 @@ TEST_F(DBTestUniversalCompaction2, IngestBehind) { TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { Options options; options.compaction_style = kCompactionStyleUniversal; - + options.env = env_; KeepFilterFactory* filter = new KeepFilterFactory(true); options.compaction_filter_factory.reset(filter); Reopen(options); @@ -2184,7 +2184,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { // Case 1: Oldest flushed file excceeds periodic compaction threshold. ASSERT_OK(Put("foo", "bar")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ(0, periodic_compactions); // Move clock forward so that the flushed file would qualify periodic // compaction. @@ -2192,7 +2192,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { // Another flush would trigger compaction the oldest file. ASSERT_OK(Put("foo", "bar2")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, periodic_compactions); @@ -2203,7 +2203,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { periodic_compactions = 0; // A flush doesn't trigger a periodic compaction when threshold not hit ASSERT_OK(Put("foo", "bar2")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, periodic_compactions); @@ -2211,7 +2211,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { // a compaction ASSERT_OK(Put("foo", "bar2")); env_->MockSleepForSeconds(48 * 60 * 60 + 100); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, periodic_compactions); ASSERT_EQ(0, start_level); diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 69940840fa9..dd632742e2b 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -8,12 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" -#include "env/composite_env_wrapper.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/file_system.h" #include "test_util/sync_point.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { class DBWALTestBase : public DBTestBase { @@ -23,13 +24,37 @@ class DBWALTestBase : public DBTestBase { #if defined(ROCKSDB_PLATFORM_POSIX) public: +#if defined(ROCKSDB_FALLOCATE_PRESENT) + bool IsFallocateSupported() { + // Test fallocate support of running file system. + // Skip this test if fallocate is not supported. + std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + assert(fd > 0); + int alloc_status = fallocate(fd, 0, 0, 1); + int err_number = errno; + close(fd); + assert(env_->DeleteFile(fname_test_fallocate) == Status::OK()); + if (err_number == ENOSYS || err_number == EOPNOTSUPP) { + fprintf(stderr, "Skipped preallocated space check: %s\n", + errnoStr(err_number).c_str()); + return false; + } + assert(alloc_status == 0); + return true; + } +#endif // ROCKSDB_FALLOCATE_PRESENT + uint64_t GetAllocatedFileSize(std::string file_name) { struct stat sbuf; int err = stat(file_name.c_str(), &sbuf); assert(err == 0); return sbuf.st_blocks * 512; } -#endif +#endif // ROCKSDB_PLATFORM_POSIX }; class DBWALTest : public DBWALTestBase { @@ -47,8 +72,8 @@ class EnrichedSpecialEnv : public SpecialEnv { InstrumentedMutexLock l(&env_mutex_); if (f == skipped_wal) { deleted_wal_reopened = true; - if (IsWAL(f) && largetest_deleted_wal.size() != 0 && - f.compare(largetest_deleted_wal) <= 0) { + if (IsWAL(f) && largest_deleted_wal.size() != 0 && + f.compare(largest_deleted_wal) <= 0) { gap_in_wals = true; } } @@ -62,9 +87,9 @@ class EnrichedSpecialEnv : public SpecialEnv { // remember its name partly because the application might attempt to // delete the file again. if (skipped_wal.size() != 0 && skipped_wal != fname) { - if (largetest_deleted_wal.size() == 0 || - largetest_deleted_wal.compare(fname) < 0) { - largetest_deleted_wal = fname; + if (largest_deleted_wal.size() == 0 || + largest_deleted_wal.compare(fname) < 0) { + largest_deleted_wal = fname; } } else { skipped_wal = fname; @@ -82,7 +107,7 @@ class EnrichedSpecialEnv : public SpecialEnv { // the wal whose actual delete was skipped by the env std::string skipped_wal = ""; // the largest WAL that was requested to be deleted - std::string largetest_deleted_wal = ""; + std::string largest_deleted_wal = ""; // number of WALs that were successfully deleted std::atomic deleted_wal_cnt = {0}; // the WAL whose delete from fs was skipped is reopened during recovery @@ -358,16 +383,16 @@ TEST_F(DBWALTest, RecoverWithBlob) { // There should be no files just yet since we haven't flushed. 
{ VersionSet* const versions = dbfull()->TEST_GetVersionSet(); - assert(versions); + ASSERT_NE(versions, nullptr); ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); - assert(cfd); + ASSERT_NE(cfd, nullptr); Version* const current = cfd->current(); - assert(current); + ASSERT_NE(current, nullptr); const VersionStorageInfo* const storage_info = current->storage_info(); - assert(storage_info); + ASSERT_NE(storage_info, nullptr); ASSERT_EQ(storage_info->num_non_empty_levels(), 0); ASSERT_TRUE(storage_info->GetBlobFiles().empty()); @@ -380,37 +405,36 @@ TEST_F(DBWALTest, RecoverWithBlob) { options.min_blob_size = min_blob_size; options.avoid_flush_during_recovery = false; options.disable_auto_compactions = true; + options.env = env_; Reopen(options); ASSERT_EQ(Get("key1"), short_value); - - // TODO: enable once Get support is implemented for blobs - // ASSERT_EQ(Get("key2"), long_value); + ASSERT_EQ(Get("key2"), long_value); VersionSet* const versions = dbfull()->TEST_GetVersionSet(); - assert(versions); + ASSERT_NE(versions, nullptr); ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); - assert(cfd); + ASSERT_NE(cfd, nullptr); Version* const current = cfd->current(); - assert(current); + ASSERT_NE(current, nullptr); const VersionStorageInfo* const storage_info = current->storage_info(); - assert(storage_info); + ASSERT_NE(storage_info, nullptr); const auto& l0_files = storage_info->LevelFiles(0); ASSERT_EQ(l0_files.size(), 1); const FileMetaData* const table_file = l0_files[0]; - assert(table_file); + ASSERT_NE(table_file, nullptr); const auto& blob_files = storage_info->GetBlobFiles(); ASSERT_EQ(blob_files.size(), 1); const auto& blob_file = blob_files.begin()->second; - assert(blob_file); + ASSERT_NE(blob_file, nullptr); ASSERT_EQ(table_file->smallest.user_key(), "key1"); ASSERT_EQ(table_file->largest.user_key(), "key2"); @@ -423,29 +447,177 @@ TEST_F(DBWALTest, RecoverWithBlob) { #ifndef ROCKSDB_LITE const InternalStats* const internal_stats = cfd->internal_stats(); - assert(internal_stats); - - const uint64_t expected_bytes = - table_file->fd.GetFileSize() + blob_file->GetTotalBlobBytes(); + ASSERT_NE(internal_stats, nullptr); const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); ASSERT_FALSE(compaction_stats.empty()); - ASSERT_EQ(compaction_stats[0].bytes_written, expected_bytes); - ASSERT_EQ(compaction_stats[0].num_output_files, 2); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); - ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], expected_bytes); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); #endif // ROCKSDB_LITE } +TEST_F(DBWALTest, RecoverWithBlobMultiSST) { + // Write several large (4 KB) values without flushing. Note that blob files + // are not actually enabled at this point. + std::string large_value(1 << 12, 'a'); + + constexpr int num_keys = 64; + + for (int i = 0; i < num_keys; ++i) { + ASSERT_OK(Put(Key(i), large_value)); + } + + // There should be no files just yet since we haven't flushed. 
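The reopen step above turns on the integrated BlobDB options (`enable_blob_files`, `min_blob_size`) so that recovery produces both table files and blob files. Outside the test harness, a minimal equivalent configuration (path and size values are placeholders) would be:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.enable_blob_files = true;      // store large values in blob files
  options.min_blob_size = 1024;          // values of at least 1 KB become blobs
  options.write_buffer_size = 64 << 10;  // small memtable, as in the MultiSST test

  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/blob_recovery_demo", &db).ok());
  assert(db->Put(rocksdb::WriteOptions(), "key2", std::string(1 << 12, 'a')).ok());

  std::string value;
  assert(db->Get(rocksdb::ReadOptions(), "key2", &value).ok());
  assert(value.size() == (1 << 12));
  delete db;
  return 0;
}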
+ { + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + ASSERT_EQ(storage_info->num_non_empty_levels(), 0); + ASSERT_TRUE(storage_info->GetBlobFiles().empty()); + } + + // Reopen the database with blob files enabled and write buffer size set to a + // smaller value. Multiple table files+blob files should be written and added + // to the Version during recovery. + Options options; + options.write_buffer_size = 1 << 16; // 64 KB + options.enable_blob_files = true; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + for (int i = 0; i < num_keys; ++i) { + ASSERT_EQ(Get(Key(i)), large_value); + } + + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_GT(l0_files.size(), 1); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_GT(blob_files.size(), 1); + + ASSERT_EQ(l0_files.size(), blob_files.size()); +} + +TEST_F(DBWALTest, WALWithChecksumHandoff) { +#ifndef ROCKSDB_ASSERT_STATUS_CHECKED + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + do { + Options options = CurrentOptions(); + + options.checksum_handoff_file_types.Add(FileType::kWalFile); + options.env = fault_fs_env.get(); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + + CreateAndReopenWithCF({"pikachu"}, options); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Both value's should be present. 
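`WALWithChecksumHandoff` above drives the `checksum_handoff_file_types` option through a `FaultInjectionTestFS`. In an ordinary application the option is enabled as sketched below, not part of this patch (the path is a placeholder; the handoff only takes effect if the underlying `FileSystem` supports checksum verification):

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Hand off a checksum of every WAL write to the FileSystem for verification.
  options.checksum_handoff_file_types.Add(rocksdb::FileType::kWalFile);

  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/checksum_handoff_demo", &db).ok());
  assert(db->Put(rocksdb::WriteOptions(), "foo", "v1").ok());
  delete db;
  return 0;
}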
+ ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v2", Get(1, "foo")); + + writeOpt.disableWAL = true; + // This put, data is persisted by Flush + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + writeOpt.disableWAL = false; + // Data is persisted in the WAL + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "zoo", "v3")); + // The hash does not match, write fails + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + writeOpt.disableWAL = false; + ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Due to the write failure, Get should not find + ASSERT_NE("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "zoo")); + ASSERT_EQ("v3", Get(1, "bar")); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + // Each write will be similated as corrupted. + fault_fs->IngestDataCorruptionBeforeWrite(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v4")); + writeOpt.disableWAL = false; + ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v4")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_NE("v4", Get(1, "foo")); + ASSERT_NE("v4", Get(1, "bar")); + fault_fs->NoDataCorruptionBeforeWrite(); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + // The file system does not provide checksum method and verification. + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v5")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v5")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v5", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "bar")); + + Destroy(options); + } while (ChangeWalOptions()); +#endif // ROCKSDB_ASSERT_STATUS_CHECKED +} + class DBRecoveryTestBlobError : public DBWALTest, public testing::WithParamInterface { public: - DBRecoveryTestBlobError() : fault_injection_env_(env_) {} - ~DBRecoveryTestBlobError() { Close(); } + DBRecoveryTestBlobError() : sync_point_(GetParam()) {} - FaultInjectionTestEnv fault_injection_env_; + std::string sync_point_; }; INSTANTIATE_TEST_CASE_P(DBRecoveryTestBlobError, DBRecoveryTestBlobError, @@ -459,20 +631,19 @@ TEST_P(DBRecoveryTestBlobError, RecoverWithBlobError) { // Reopen with blob files enabled but make blob file writing fail during // recovery. 
- SyncPoint::GetInstance()->SetCallBack(GetParam(), [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(false, Status::IOError()); + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); }); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(true); - }); SyncPoint::GetInstance()->EnableProcessing(); Options options; options.enable_blob_files = true; options.avoid_flush_during_recovery = false; options.disable_auto_compactions = true; - options.env = &fault_injection_env_; + options.env = env_; ASSERT_NOK(TryReopen(options)); @@ -500,13 +671,11 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { do { // delete old files in backup_logs directory - env_->CreateDirIfMissing(backup_logs); + ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); std::vector old_files; - env_->GetChildren(backup_logs, &old_files); + ASSERT_OK(env_->GetChildren(backup_logs, &old_files)); for (auto& file : old_files) { - if (file != "." && file != "..") { - env_->DeleteFile(backup_logs + "/" + file); - } + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); } Options options = CurrentOptions(); options.create_if_missing = true; @@ -524,11 +693,9 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { // copy the logs to backup std::vector logs; - env_->GetChildren(options.wal_dir, &logs); + ASSERT_OK(env_->GetChildren(options.wal_dir, &logs)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); - } + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); } // recover the DB @@ -539,9 +706,7 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { // copy the logs from backup back to wal dir for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); } // this should ignore the log files, recovery should not happen again // if the recovery happens, the same merge operator would be called twice, @@ -555,11 +720,9 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { Close(); // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); } // assert that we successfully recovered only from logs, even though we // destroyed the DB @@ -570,16 +733,14 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { // Recovery will fail if DB directory doesn't exist. Destroy(options); // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir)); for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - // we won't be needing this file no more - env_->DeleteFile(backup_logs + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + log)); } Status s = TryReopen(options); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(s); Destroy(options); } while (ChangeWalOptions()); } @@ -617,9 +778,9 @@ TEST_F(DBWALTest, PreallocateBlock) { called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -636,9 +797,9 @@ TEST_F(DBWALTest, PreallocateBlock) { called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -656,9 +817,9 @@ TEST_F(DBWALTest, PreallocateBlock) { called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -677,9 +838,9 @@ TEST_F(DBWALTest, PreallocateBlock) { called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -905,7 +1066,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) { // Make 'dobrynia' to be flushed and new WAL file to be created ASSERT_OK(Put(2, Key(10), DummyString(7500000))); ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); { auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), static_cast(1)); @@ -959,7 +1120,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) { // Make 'nikitich' memtable to be flushed ASSERT_OK(Put(3, Key(10), DummyString(1002400))); ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // 4 memtable are not flushed, 1 sst file { auto tables = ListTableFiles(env_, dbname_); @@ -979,7 +1140,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) { ASSERT_OK(Put(3, Key(10), DummyString(1002400))); // make it flush ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // There are still 4 memtable not flushed, and 2 sst tables ASSERT_OK(Put(0, Key(1), DummyString(1))); ASSERT_OK(Put(1, Key(1), DummyString(1))); @@ -1027,10 +1188,10 @@ TEST_F(DBWALTest, SyncMultipleLogs) { for (uint64_t b = 0; b < kNumBatches; b++) { batch.Clear(); for (int i = 0; i < kBatchSize; i++) { - batch.Put(Key(i), DummyString(128)); + ASSERT_OK(batch.Put(Key(i), DummyString(128))); } - dbfull()->Write(wo, &batch); + ASSERT_OK(dbfull()->Write(wo, &batch)); } 
ASSERT_OK(dbfull()->SyncWAL()); @@ -1058,7 +1219,7 @@ TEST_F(DBWALTest, PartOfWritesWithWALDisabled) { ASSERT_OK(Flush(0)); ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5 ASSERT_EQ("v5", Get(0, "key")); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); // Simulate a crash. fault_env->SetFilesystemActive(false); Close(); @@ -1094,30 +1255,31 @@ class RecoveryTestHelper { *count = 0; std::shared_ptr table_cache = NewLRUCache(50, 0); - EnvOptions env_options; + FileOptions file_options; WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); std::unique_ptr versions; std::unique_ptr wal_manager; WriteController write_controller; - versions.reset(new VersionSet( - test->dbname_, &db_options, env_options, table_cache.get(), - &write_buffer_manager, &write_controller, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); + versions.reset(new VersionSet(test->dbname_, &db_options, file_options, + table_cache.get(), &write_buffer_manager, + &write_controller, + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")); wal_manager.reset( - new WalManager(db_options, env_options, /*io_tracer=*/nullptr)); + new WalManager(db_options, file_options, /*io_tracer=*/nullptr)); std::unique_ptr current_log_writer; for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { uint64_t current_log_number = j; std::string fname = LogFileName(test->dbname_, current_log_number); - std::unique_ptr file; - ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), fname, env_options)); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(db_options.env->GetFileSystem(), + fname, file_options, &file_writer, + nullptr)); current_log_writer.reset( new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0)); @@ -1126,12 +1288,13 @@ class RecoveryTestHelper { for (int i = 0; i < kKeysPerWALFile; i++) { std::string key = "key" + ToString((*count)++); std::string value = test->DummyString(kValueSize); - assert(current_log_writer.get() != nullptr); + ASSERT_NE(current_log_writer.get(), nullptr); uint64_t seq = versions->LastSequence() + 1; batch.Clear(); - batch.Put(key, value); + ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK(current_log_writer->AddRecord( + WriteBatchInternal::Contents(&batch))); versions->SetLastAllocatedSequence(seq); versions->SetLastPublishedSequence(seq); versions->SetLastSequence(seq); @@ -1179,32 +1342,13 @@ class RecoveryTestHelper { test->Close(); #endif if (trunc) { - ASSERT_EQ(0, truncate(fname.c_str(), static_cast(size * off))); + ASSERT_OK( + test::TruncateFile(env, fname, static_cast(size * off))); } else { - InduceCorruption(fname, static_cast(size * off + 8), - static_cast(size * len)); + ASSERT_OK(test::CorruptFile(env, fname, static_cast(size * off + 8), + static_cast(size * len), false)); } } - - // Overwrite data with 'a' from offset for length len - static void InduceCorruption(const std::string& filename, size_t offset, - size_t len) { - ASSERT_GT(len, 0U); - - int fd = open(filename.c_str(), O_RDWR); - - // On windows long is 32-bit - ASSERT_LE(offset, std::numeric_limits::max()); - - ASSERT_GT(fd, 0); - ASSERT_EQ(offset, lseek(fd, static_cast(offset), SEEK_SET)); - - void* buf = alloca(len); - memset(buf, 'b', len); - 
ASSERT_EQ(len, write(fd, buf, static_cast(len))); - - close(fd); - } }; class DBWALTestWithParams @@ -1326,11 +1470,11 @@ TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) { ASSERT_OK(Put(1, "key3", "val3")); // Corrupt WAL at location of key3 - RecoveryTestHelper::InduceCorruption( - fname, static_cast(offset_to_corrupt), static_cast(4)); + ASSERT_OK(test::CorruptFile(env, fname, static_cast(offset_to_corrupt), + 4, false)); ASSERT_OK(Put(2, "key4", "val4")); ASSERT_OK(Put(1, "key5", "val5")); - Flush(2); + ASSERT_OK(Flush(2)); // PIT recovery & verify options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; @@ -1484,7 +1628,7 @@ TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) { for (int i = 0; i < 2; ++i) { if (i > 0) { // Flush() triggers deletion of obsolete tracked files - Flush(); + ASSERT_OK(Flush()); } VectorLogPtr log_files; ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); @@ -1526,7 +1670,7 @@ TEST_F(DBWALTest, RecoverWithoutFlush) { ASSERT_EQ(Get("foo"), "foo_v2"); ASSERT_EQ(Get("bar"), "bar_v2"); // manual flush and insert again - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ(Get("foo"), "foo_v2"); ASSERT_EQ(Get("bar"), "bar_v2"); ASSERT_OK(Put("foo", "foo_v3")); @@ -1547,7 +1691,9 @@ TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { auto countWalFiles = [this]() { VectorLogPtr log_files; - dbfull()->GetSortedWalFiles(log_files); + if (!dbfull()->GetSortedWalFiles(log_files).ok()) { + return size_t{0}; + } return log_files.size(); }; @@ -1555,11 +1701,11 @@ TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { CreateAndReopenWithCF({"one", "two"}, options); ASSERT_OK(Put(0, "key1", kSmallValue)); ASSERT_OK(Put(1, "key2", kLargeValue)); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ(1, countWalFiles()); ASSERT_OK(Put(0, "key3", kSmallValue)); ASSERT_OK(Put(2, "key4", kLargeValue)); - Flush(2); + ASSERT_OK(Flush(2)); ASSERT_EQ(2, countWalFiles()); // Reopen, insert and flush. @@ -1573,9 +1719,9 @@ TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { ASSERT_OK(Put(0, "key5", kLargeValue)); ASSERT_OK(Put(1, "key6", kLargeValue)); ASSERT_EQ(3, countWalFiles()); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(2, "key7", kLargeValue)); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); ASSERT_EQ(4, countWalFiles()); // Reopen twice and validate. @@ -1656,19 +1802,8 @@ TEST_P(DBWALTestWithParamsVaryingRecoveryMode, // avoid_flush_during_recovery=true. // Flush should trigger if max_total_wal_size is reached. TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { - class TestFlushListener : public EventListener { - public: - std::atomic count{0}; - - TestFlushListener() = default; - - void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { - count++; - assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason); - } - }; - std::shared_ptr test_listener = - std::make_shared(); + auto test_listener = std::make_shared(); + test_listener->expected_flush_reason = FlushReason::kWalFull; constexpr size_t kKB = 1024; constexpr size_t kMB = 1024 * 1024; @@ -1708,7 +1843,9 @@ TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { 1 * kMB); // Write one more key to trigger flush. ASSERT_OK(Put(0, "foo", "v2")); - dbfull()->TEST_WaitForFlushMemTable(); + for (auto* h : handles_) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(h)); + } // Flushed two column families. 
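The listener rewrite above swaps the local `TestFlushListener` for a shared counting-listener helper that expects `FlushReason::kWalFull`. A minimal listener of that shape (the class name `WalFullFlushCounter` is hypothetical) is:

#include <atomic>

#include "rocksdb/listener.h"

class WalFullFlushCounter : public rocksdb::EventListener {
 public:
  void OnFlushBegin(rocksdb::DB* /*db*/,
                    const rocksdb::FlushJobInfo& info) override {
    // Count only the flushes triggered because the WAL size limit was hit.
    if (info.flush_reason == rocksdb::FlushReason::kWalFull) {
      count.fetch_add(1);
    }
  }
  std::atomic<int> count{0};
};
// Installed via options.listeners.push_back(std::make_shared<WalFullFlushCounter>()).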
ASSERT_EQ(2, test_listener->count.load()); } @@ -1720,24 +1857,15 @@ TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { constexpr size_t kKB = 1024; Options options = CurrentOptions(); + options.env = env_; options.avoid_flush_during_recovery = true; - // Test fallocate support of running file system. - // Skip this test if fallocate is not supported. - std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; - int fd = -1; - do { - fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); - } while (fd < 0 && errno == EINTR); - ASSERT_GT(fd, 0); - int alloc_status = fallocate(fd, 0, 0, 1); - int err_number = errno; - close(fd); - ASSERT_OK(options.env->DeleteFile(fname_test_fallocate)); - if (err_number == ENOSYS || err_number == EOPNOTSUPP) { - fprintf(stderr, "Skipped preallocated space check: %s\n", strerror(err_number)); + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { return; } - ASSERT_EQ(0, alloc_status); DestroyAndReopen(options); size_t preallocated_size = @@ -1760,6 +1888,175 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), preallocated_size); } +// Tests that we will truncate the preallocated space of the last log from +// previous. +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + options.avoid_flush_during_shutdown = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + // The log file has preallocated space. + Close(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // After the flush during Open, the log file should get deleted. However, + // if the process is in a crash loop, the log file may not get + // deleted and thte preallocated space will keep accumulating. So we need + // to ensure it gets trtuncated. 
+ EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) { + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + Close(); + std::vector filenames; + std::string last_log; + uint64_t last_log_num = 0; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + for (auto fname : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(fname, &number, &type, nullptr)) { + if (type == kWalFile && number > last_log_num) { + last_log = fname; + } + } + } + ASSERT_NE(last_log, ""); + last_log = dbname_ + '/' + last_log; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", + [](void* arg) { *(reinterpret_cast(arg)) = 0; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Preallocate space for the empty log file. This could happen if WAL data + // was buffered in memory and the process crashed. + std::unique_ptr log_file; + ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions())); + log_file->SetPreallocationBlockSize(preallocated_size); + log_file->PrepareWrite(0, 4096); + log_file.reset(); + + ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size); + + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // The preallocated space should be truncated. 
+ EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + // create DB and close with file truncate disabled + std::atomic_bool enable_truncate{false}; + + SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", [&](void* arg) { + if (!enable_truncate) { + *(reinterpret_cast(arg)) = 0; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. + auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName()); + ASSERT_GE(db_size, preallocated_size); + Close(); + + // enable truncate and open DB as readonly, the file should not be truncated + // and DB size is not changed. + enable_truncate = true; + ASSERT_OK(ReadOnlyReopen(options)); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName()); + // The preallocated space should NOT be truncated. + // the DB size is almost the same. 
+ ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size, + db_size / 100); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} #endif // ROCKSDB_FALLOCATE_PRESENT #endif // ROCKSDB_PLATFORM_POSIX @@ -1777,9 +2074,9 @@ TEST_F(DBWALTest, WalTermTest) { wo.disableWAL = false; WriteBatch batch; - batch.Put("foo", "bar"); + ASSERT_OK(batch.Put("foo", "bar")); batch.MarkWalTerminationPoint(); - batch.Put("foo2", "bar2"); + ASSERT_OK(batch.Put("foo2", "bar2")); ASSERT_OK(dbfull()->Write(wo, &batch)); diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index a0984bc2ff7..a227eb9395d 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -32,6 +32,22 @@ class DBBasicTestWithTimestampBase : public DBTestBase { return ret; } + static std::string KeyWithPrefix(std::string prefix, uint64_t k) { + std::string ret; + PutFixed64(&ret, k); + std::reverse(ret.begin(), ret.end()); + return prefix + ret; + } + + static std::vector ConvertStrToSlice( + std::vector& strings) { + std::vector ret; + for (const auto& s : strings) { + ret.emplace_back(s); + } + return ret; + } + class TestComparator : public Comparator { private: const Comparator* cmp_without_ts_; @@ -141,7 +157,8 @@ class DBBasicTestWithTimestampBase : public DBTestBase { ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size()); ukey_and_ts.append(expected_ts.data(), expected_ts.size()); ParsedInternalKey parsed_ikey; - ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey)); + ASSERT_OK( + ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key); ASSERT_EQ(expected_val_type, parsed_ikey.type); ASSERT_EQ(expected_seq, parsed_ikey.sequence); @@ -161,7 +178,8 @@ class DBBasicTestWithTimestampBase : public DBTestBase { ukey_and_ts.append(expected_ts.data(), expected_ts.size()); ParsedInternalKey parsed_ikey; - ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey)); + ASSERT_OK( + ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); ASSERT_EQ(expected_val_type, parsed_ikey.type); ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key); if (expected_val_type == kTypeValue) { @@ -177,7 +195,218 @@ class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { : DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {} }; -TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { +TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + std::string start_str = "foo"; + std::string end_str = "foo2"; + Slice start(start_str), end(end_str); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; 
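The timestamp tests in this file all share the same plumbing: a comparator that declares a timestamp size, plus `WriteOptions::timestamp` and `ReadOptions::timestamp`. A minimal fragment of that plumbing (the function name is hypothetical; it assumes the DB was opened with an 8-byte-timestamp comparator that orders larger encodings as newer, as the `TestComparator` here does):

#include <string>

#include "rocksdb/db.h"

rocksdb::Status PutAndGetAtTimestamp(rocksdb::DB* db) {
  std::string write_ts(8, '\0');  // encoded timestamp; size must match the comparator
  rocksdb::Slice write_ts_slice(write_ts);
  rocksdb::WriteOptions write_opts;
  write_opts.timestamp = &write_ts_slice;
  rocksdb::Status s = db->Put(write_opts, "foo1", "bar");
  if (!s.ok()) {
    return s;
  }

  std::string read_ts(8, '\xff');  // read as of the newest possible timestamp
  rocksdb::Slice read_ts_slice(read_ts);
  rocksdb::ReadOptions read_opts;
  read_opts.timestamp = &read_ts_slice;
  std::string value;
  return db->Get(read_opts, "foo1", &value);
}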
+ DestroyAndReopen(options); + + const std::string kKey = "test kKey"; + + // Test set ts_low first and flush() + int current_ts_low = 5; + std::string ts_low_str = Timestamp(current_ts_low, 0); + Slice ts_low = ts_low_str; + CompactRangeOptions comp_opts; + comp_opts.full_history_ts_low = &ts_low; + comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + + auto* cfd = + static_cast_with_check(db_->DefaultColumnFamily()) + ->cfd(); + auto result_ts_low = cfd->GetFullHistoryTsLow(); + + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + for (int i = 0; i < 10; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, kKey, Key(i))); + } + ASSERT_OK(Flush()); + + for (int i = 0; i < 10; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status status = db_->Get(read_opts, kKey, &value); + if (i < current_ts_low) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_OK(status); + ASSERT_TRUE(value.compare(Key(i)) == 0); + } + } + + // Test set ts_low and then trigger compaction + for (int i = 10; i < 20; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, kKey, Key(i))); + } + + ASSERT_OK(Flush()); + + current_ts_low = 15; + ts_low_str = Timestamp(current_ts_low, 0); + ts_low = ts_low_str; + comp_opts.full_history_ts_low = &ts_low; + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + result_ts_low = cfd->GetFullHistoryTsLow(); + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + for (int i = 0; i < 20; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status status = db_->Get(read_opts, kKey, &value); + if (i < current_ts_low) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_OK(status); + ASSERT_TRUE(value.compare(Key(i)) == 0); + } + } + + // Test invalid compaction with range + Slice start(kKey), end(kKey); + Status s = db_->CompactRange(comp_opts, &start, &end); + ASSERT_TRUE(s.IsInvalidArgument()); + s = db_->CompactRange(comp_opts, &start, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + s = db_->CompactRange(comp_opts, nullptr, &end); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Test invalid compaction with the decreasing ts_low + ts_low_str = Timestamp(current_ts_low - 1, 0); + ts_low = ts_low_str; + comp_opts.full_history_ts_low = &ts_low; + s = db_->CompactRange(comp_opts, nullptr, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(db_->Put(write_opts, Key(i), rnd.RandomString(1024))); + } + + 
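The queries that follow use the `SizeApproximationOptions` overload of `GetApproximateSizes`; in isolation it looks like this (the function name and key bounds are placeholders; `include_memtabtles` is the actual field spelling in the public header):

#include <cassert>
#include <cstdint>

#include "rocksdb/db.h"

void ApproximateRangeSize(rocksdb::DB* db) {
  rocksdb::SizeApproximationOptions size_opts;
  size_opts.include_memtabtles = true;  // count data still in memtables
  size_opts.include_files = true;       // count data in SST files
  rocksdb::Range r("key50", "key60");   // [start, limit)
  uint64_t size = 0;
  rocksdb::Status s = db->GetApproximateSizes(
      size_opts, db->DefaultColumnFamily(), &r, 1, &size);
  assert(s.ok());
  (void)size;  // approximate bytes covered by the range
}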
uint64_t size; + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 204800); + + // test multiple ranges + std::vector ranges; + std::string start_tmp = Key(10); + std::string end_tmp = Key(20); + ranges.emplace_back(Range(start_tmp, end_tmp)); + ranges.emplace_back(Range(start, end)); + uint64_t range_sizes[2]; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, + ranges.data(), 2, range_sizes)); + + ASSERT_EQ(range_sizes[1], size); + + // Zero if not including mem table + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); + ASSERT_EQ(size, 0); + + start = Key(500); + end = Key(600); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + // Test range boundaries + ASSERT_OK(db_->Put(write_opts, Key(1000), rnd.RandomString(1024))); + // Should include start key + start = Key(1000); + end = Key(1100); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 0); + + // Should exclude end key + start = Key(900); + end = Key(1000); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleIterate) { const int kNumKeysPerFile = 128; const uint64_t kMaxKey = 1024; Options options = CurrentOptions(); @@ -209,6 +438,7 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { std::unique_ptr it(db_->NewIterator(read_opts)); int count = 0; uint64_t key = 0; + // Forward iterate. for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid(); it->Next(), ++count, ++key) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, @@ -217,7 +447,16 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { size_t expected_count = kMaxKey - start_keys[i] + 1; ASSERT_EQ(expected_count, count); - // SeekToFirst() with lower bound. + // Backward iterate. + count = 0; + for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid(); + it->Prev(), ++count, --key) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + ASSERT_EQ(static_cast(kMaxKey) - start_keys[i] + 1, count); + + // SeekToFirst()/SeekToLast() with lower/upper bounds. // Then iter with lower and upper bounds. 
uint64_t l = 0; uint64_t r = kMaxKey + 1; @@ -235,6 +474,12 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { "value" + std::to_string(i), write_timestamps[i]); } ASSERT_EQ(r - std::max(l, start_keys[i]), count); + + for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0; + it->Valid(); it->Prev(), --key, ++count) { + CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } l += (kMaxKey / 100); r -= (kMaxKey / 100); } @@ -242,256 +487,917 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { Close(); } -TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) { - constexpr int kNumKeysPerFile = 128; - constexpr uint64_t kMaxKey = 1024; - Options options = CurrentOptions(); - options.env = env_; +class DBBasicTestWithTimestampTableOptions + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + explicit DBBasicTestWithTimestampTableOptions() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_table_options") {} +}; + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampTableOptions, + testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)); + +TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) { + Options options = GetDefaultOptions(); options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.index_type = GetParam(); + bbto.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const size_t kTimestampSize = Timestamp(0, 0).size(); - TestComparator test_cmp(kTimestampSize); - options.comparator = &test_cmp; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + TestComparator cmp(kTimestampSize); + options.comparator = &cmp; DestroyAndReopen(options); - const std::vector write_timestamps = {Timestamp(1, 0), - Timestamp(3, 0)}; - const std::vector read_timestamps = {Timestamp(2, 0), - Timestamp(4, 0)}; - const std::vector read_timestamps_lb = {Timestamp(1, 0), - Timestamp(1, 0)}; - for (size_t i = 0; i < write_timestamps.size(); ++i) { + constexpr uint64_t kNumKeys = 1024; + for (uint64_t k = 0; k < kNumKeys; ++k) { WriteOptions write_opts; - Slice write_ts = write_timestamps[i]; - write_opts.timestamp = &write_ts; - for (uint64_t key = 0; key <= kMaxKey; ++key) { - Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); - ASSERT_OK(s); - } + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, Key1(k), "value" + std::to_string(k))); } - for (size_t i = 0; i < read_timestamps.size(); ++i) { + ASSERT_OK(Flush()); + { ReadOptions read_opts; - Slice read_ts = read_timestamps[i]; - Slice read_ts_lb = read_timestamps_lb[i]; - read_opts.timestamp = &read_ts; - read_opts.iter_start_ts = &read_ts_lb; + read_opts.total_order_seek = true; + std::string ts_str = Timestamp(2, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; std::unique_ptr it(db_->NewIterator(read_opts)); - int count = 0; - uint64_t key = 0; - for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { - CheckIterEntry(it.get(), Key1(key), kTypeValue, - "value" + std::to_string(i), write_timestamps[i]); - if (i > 0) { - 
it->Next(); - CheckIterEntry(it.get(), Key1(key), kTypeValue, - "value" + std::to_string(i - 1), - write_timestamps[i - 1]); + // verify Get() + for (it->SeekToFirst(); it->Valid(); it->Next()) { + std::string value_from_get; + std::string key_str(it->key().data(), it->key().size()); + std::string timestamp; + ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, ×tamp)); + ASSERT_EQ(it->value(), value_from_get); + ASSERT_EQ(Timestamp(1, 0), timestamp); + } + + // verify MultiGet() + constexpr uint64_t step = 2; + static_assert(0 == (kNumKeys % step), + "kNumKeys must be a multiple of step"); + for (uint64_t k = 0; k < kNumKeys; k += 2) { + std::vector key_strs; + std::vector keys; + for (size_t i = 0; i < step; ++i) { + key_strs.push_back(Key1(k + i)); + } + for (size_t i = 0; i < step; ++i) { + keys.emplace_back(key_strs[i]); + } + std::vector values; + std::vector timestamps; + std::vector statuses = + db_->MultiGet(read_opts, keys, &values, ×tamps); + ASSERT_EQ(step, statuses.size()); + ASSERT_EQ(step, values.size()); + ASSERT_EQ(step, timestamps.size()); + for (uint64_t i = 0; i < step; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("value" + std::to_string(k + i), values[i]); + ASSERT_EQ(Timestamp(1, 0), timestamps[i]); } } - size_t expected_count = kMaxKey + 1; - ASSERT_EQ(expected_count, count); } - // Delete all keys@ts=5 and check iteration result with start ts set + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo3", "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(1, 0); + ts = read_ts; + read_opts.timestamp = &ts; { - std::string write_timestamp = Timestamp(5, 0); - WriteOptions write_opts; - Slice write_ts = write_timestamp; - write_opts.timestamp = &write_ts; - for (uint64_t key = 0; key < kMaxKey + 1; ++key) { - Status s = db_->Delete(write_opts, Key1(key)); - ASSERT_OK(s); - } + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); - std::string read_timestamp = Timestamp(6, 0); - ReadOptions read_opts; - Slice read_ts = read_timestamp; - read_opts.timestamp = &read_ts; - std::string read_timestamp_lb = Timestamp(2, 0); - Slice read_ts_lb = read_timestamp_lb; - read_opts.iter_start_ts = &read_ts_lb; - std::unique_ptr it(db_->NewIterator(read_opts)); - int count = 
0; - uint64_t key = 0; - for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { - CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), - write_ts); - // Skip key@ts=3 and land on tombstone key@ts=5 - it->Next(); - } - ASSERT_EQ(kMaxKey + 1, count); + iter->Seek("bbb"); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLongerThanKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(20)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo3", "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(2, 0); + ts = read_ts; + read_opts.timestamp = &ts; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure the prefix extractor doesn't include timestamp, otherwise it + // may return invalid result. 
+ iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar1")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar2")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (int i = 3; i < 9; ++i) { + ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), + "bar" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(2, 0); + ts = read_ts; + read_opts.timestamp = &ts; + std::string up_bound = "foo5"; // exclusive + Slice up_bound_slice = up_bound; + std::string lo_bound = "foo2"; // inclusive + Slice lo_bound_slice = lo_bound; + read_opts.iterate_upper_bound = &up_bound_slice; + read_opts.iterate_lower_bound = &lo_bound_slice; + read_opts.auto_prefix_mode = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure the prefix extractor doesn't include timestamp, otherwise it + // may return invalid result. 
+ iter->Seek("foo"); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekToFirst(); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekForPrev("g"); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + } + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + const std::vector timestamps = {Timestamp(1, 1), Timestamp(0, 2), + Timestamp(4, 3)}; + const std::vector> kvs = { + std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")}; + for (const auto& ts : timestamps) { + WriteBatch wb(0, 0, kTimestampSize); + for (const auto& kv : kvs) { + const std::string& key = std::get<0>(kv); + const std::string& value = std::get<1>(kv); + ASSERT_OK(wb.Put(key, value)); + } + + ASSERT_OK(wb.AssignTimestamp(ts)); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + } + std::string read_ts_str = Timestamp(5, 3); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + + it->SeekToFirst(); + ASSERT_TRUE(it->Valid()); + it->Prev(); + ASSERT_FALSE(it->Valid()); + + it->SeekToLast(); + ASSERT_TRUE(it->Valid()); + uint64_t prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(0, prev_reseek_count); + it->Next(); + ASSERT_FALSE(it->Valid()); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->Seek(std::get<0>(kvs[0])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(1, prev_reseek_count); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->SeekForPrev(std::get<0>(kvs[1])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it.reset(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) { + constexpr int kNumKeysPerFile = 128; + constexpr uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + 
options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(3, 0)}; + const std::vector read_timestamps = {Timestamp(2, 0), + Timestamp(4, 0)}; + const std::vector read_timestamps_lb = {Timestamp(1, 0), + Timestamp(1, 0)}; + for (size_t i = 0; i < write_timestamps.size(); ++i) { + WriteOptions write_opts; + Slice write_ts = write_timestamps[i]; + write_opts.timestamp = &write_ts; + for (uint64_t key = 0; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + for (size_t i = 0; i < read_timestamps.size(); ++i) { + ReadOptions read_opts; + Slice read_ts = read_timestamps[i]; + Slice read_ts_lb = read_timestamps_lb[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + if (i > 0) { + it->Next(); + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i - 1), + write_timestamps[i - 1]); + } + } + size_t expected_count = kMaxKey + 1; + ASSERT_EQ(expected_count, count); + } + // Delete all keys@ts=5 and check iteration result with start ts set + { + std::string write_timestamp = Timestamp(5, 0); + WriteOptions write_opts; + Slice write_ts = write_timestamp; + write_opts.timestamp = &write_ts; + for (uint64_t key = 0; key < kMaxKey + 1; ++key) { + Status s = db_->Delete(write_opts, Key1(key)); + ASSERT_OK(s); + } + + std::string read_timestamp = Timestamp(6, 0); + ReadOptions read_opts; + Slice read_ts = read_timestamp; + read_opts.timestamp = &read_ts; + std::string read_timestamp_lb = Timestamp(2, 0); + Slice read_ts_lb = read_timestamp_lb; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), + write_ts); + // Skip key@ts=3 and land on tombstone key@ts=5 + it->Next(); + } + ASSERT_EQ(kMaxKey + 1, count); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ForwardIterateStartSeqnum) { + const int kNumKeysPerFile = 128; + const uint64_t kMaxKey = 0xffffffffffffffff; + const uint64_t kMinKey = kMaxKey - 1023; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + // Need to disable compaction to bottommost level when sequence number will be + // zeroed out, causing the verification of sequence number to fail in this + // test. 
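`SimpleForwardIterateLowerTsBound` above combines `ReadOptions::timestamp` with `ReadOptions::iter_start_ts`, so the iterator surfaces every version of a key (including timestamped tombstones) that is no newer than `timestamp` and no older than `iter_start_ts`, newest first per key. A minimal fragment of that read path (the function name is hypothetical):

#include <memory>
#include <string>

#include "rocksdb/db.h"

void ScanTimestampRange(rocksdb::DB* db, const std::string& read_ts,
                        const std::string& start_ts) {
  rocksdb::Slice read_ts_slice(read_ts);
  rocksdb::Slice start_ts_slice(start_ts);
  rocksdb::ReadOptions read_opts;
  read_opts.timestamp = &read_ts_slice;       // newest visible timestamp
  read_opts.iter_start_ts = &start_ts_slice;  // oldest visible timestamp
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_opts));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Each visible version of each key is surfaced here; consume it->key()
    // and it->value() as needed.
  }
}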
+ options.disable_auto_compactions = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + std::vector start_seqs; + + const int kNumTimestamps = 4; + std::vector write_ts_list; + for (int t = 0; t != kNumTimestamps; ++t) { + write_ts_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); + } + WriteOptions write_opts; + for (size_t i = 0; i != write_ts_list.size(); ++i) { + Slice write_ts = write_ts_list[i]; + write_opts.timestamp = &write_ts; + for (uint64_t k = kMaxKey; k >= kMinKey; --k) { + Status s; + if (k % 2) { + s = db_->Put(write_opts, Key1(k), "value" + std::to_string(i)); + } else { + s = db_->Delete(write_opts, Key1(k)); + } + ASSERT_OK(s); + } + start_seqs.push_back(db_->GetLatestSequenceNumber()); + } + std::vector read_ts_list; + for (int t = 0; t != kNumTimestamps - 1; ++t) { + read_ts_list.push_back(Timestamp(2 * t + 3, /*do not care*/ 17)); + } + + ReadOptions read_opts; + // Scan with only read_opts.iter_start_seqnum set. + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_seqnum = start_seqs[i] + 1; + std::unique_ptr iter(db_->NewIterator(read_opts)); + SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; + uint64_t key = kMinKey; + for (iter->Seek(Key1(kMinKey)); iter->Valid(); iter->Next()) { + CheckIterEntry( + iter.get(), Key1(key), expected_seq, + (key % 2) ? kTypeValue : kTypeDeletionWithTimestamp, + (key % 2) ? "value" + std::to_string(i + 1) : std::string(), + write_ts_list[i + 1]); + ++key; + --expected_seq; + } + } + // Scan with both read_opts.iter_start_seqnum and read_opts.iter_start_ts set. + std::vector read_ts_lb_list; + for (int t = 0; t < kNumTimestamps - 1; ++t) { + read_ts_lb_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); + } + for (size_t i = 0; i < read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + Slice read_ts_lb = read_ts_lb_list[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + read_opts.iter_start_seqnum = start_seqs[i] + 1; + std::unique_ptr it(db_->NewIterator(read_opts)); + uint64_t key = kMinKey; + SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; + for (it->Seek(Key1(kMinKey)); it->Valid(); it->Next()) { + CheckIterEntry(it.get(), Key1(key), expected_seq, + (key % 2) ? 
kTypeValue : kTypeDeletionWithTimestamp, + "value" + std::to_string(i + 1), write_ts_list[i + 1]); + ++key; + --expected_seq; + } + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Insert kNumKeys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "foo", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + ts_str = Timestamp(kNumKeys, 0); + ts = ts_str; + read_opts.timestamp = &ts; + iter.reset(db_->NewIterator(read_opts)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo", kTypeValue, + "value" + std::to_string(kNumKeys - 1), ts_str); + ASSERT_EQ( + 2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Write kNumKeys + 1 keys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "a", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + WriteBatch batch(0, 0, kTimestampSize); + ASSERT_OK(batch.Put("a", "new_value")); + ASSERT_OK(batch.Put("b", "new_value")); + s = batch.AssignTimestamp(ts_str); + ASSERT_OK(s); + s = db_->Write(write_opts, &batch); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("a"); + iter->Next(); + CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) { + Options options = GetDefaultOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + 
options.comparator = &test_cmp; + DestroyAndReopen(options); + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + Status s = db_->Put(write_opts, "b", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "a", "value")); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + iter->Prev(); + CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + 
options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); Close(); } -TEST_F(DBBasicTestWithTimestamp, ForwardIterateStartSeqnum) { - const int kNumKeysPerFile = 128; - const uint64_t kMaxKey = 0xffffffffffffffff; - const uint64_t kMinKey = kMaxKey - 1023; +TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; - // Need to disable compaction to bottommost level when sequence number will be - // zeroed out, causing the verification of sequence number to fail in this - // test. - options.disable_auto_compactions = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const size_t kTimestampSize = Timestamp(0, 0).size(); TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); DestroyAndReopen(options); - std::vector start_seqs; - const int kNumTimestamps = 4; - std::vector write_ts_list; - for (int t = 0; t != kNumTimestamps; ++t) { - write_ts_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); - } + // Write any value WriteOptions write_opts; - for (size_t i = 0; i != write_ts_list.size(); ++i) { - Slice write_ts = write_ts_list[i]; - write_opts.timestamp = &write_ts; - for (uint64_t k = kMaxKey; k >= kMinKey; --k) { - Status s; - if (k % 2) { - s = db_->Put(write_opts, Key1(k), "value" + std::to_string(i)); - } else { - s = db_->Delete(write_opts, Key1(k)); - } - ASSERT_OK(s); - } - start_seqs.push_back(db_->GetLatestSequenceNumber()); - } - std::vector read_ts_list; - for (int t = 0; t != kNumTimestamps - 1; ++t) { - read_ts_list.push_back(Timestamp(2 * t + 3, /*do not care*/ 17)); + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + // random data + for (int i = 0; i < 3; i++) { + auto key = ToString(i * 10); + auto value = ToString(i * 10); + Slice key_slice = key; + Slice value_slice = value; + ASSERT_OK(db_->Put(write_opts, key_slice, value_slice)); + ASSERT_OK(Flush()); } + // Make num_levels to 2 to do key range filtering of sst files + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + 
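As an aside, the timestamped point-lookup pattern these MultiGet tests repeat is roughly the following (an illustrative sketch, not part of the patch; db and the encoded timestamp string stand in for the test fixture's members):

    std::string read_ts_str = Timestamp(2, 0);  // test helper that encodes a timestamp
    Slice read_ts = read_ts_str;
    ReadOptions read_opts;
    read_opts.timestamp = &read_ts;             // read as of this timestamp
    std::vector<Slice> keys{"foo"};
    std::vector<PinnableSlice> values(keys.size());
    std::vector<Status> statuses(keys.size());
    db->MultiGet(read_opts, db->DefaultColumnFamily(), keys.size(), keys.data(),
                 values.data(), statuses.data());
    // statuses[0] reports whether "foo" was visible at read_ts.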
ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; ReadOptions read_opts; - // Scan with only read_opts.iter_start_seqnum set. - for (size_t i = 0; i != read_ts_list.size(); ++i) { - Slice read_ts = read_ts_list[i]; - read_opts.timestamp = &read_ts; - read_opts.iter_start_seqnum = start_seqs[i] + 1; - std::unique_ptr iter(db_->NewIterator(read_opts)); - SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; - uint64_t key = kMinKey; - for (iter->Seek(Key1(kMinKey)); iter->Valid(); iter->Next()) { - CheckIterEntry( - iter.get(), Key1(key), expected_seq, - (key % 2) ? kTypeValue : kTypeDeletionWithTimestamp, - (key % 2) ? "value" + std::to_string(i + 1) : std::string(), - write_ts_list[i + 1]); - ++key; - --expected_seq; - } - } - // Scan with both read_opts.iter_start_seqnum and read_opts.iter_start_ts set. - std::vector read_ts_lb_list; - for (int t = 0; t < kNumTimestamps - 1; ++t) { - read_ts_lb_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); - } - for (size_t i = 0; i < read_ts_list.size(); ++i) { - Slice read_ts = read_ts_list[i]; - Slice read_ts_lb = read_ts_lb_list[i]; - read_opts.timestamp = &read_ts; - read_opts.iter_start_ts = &read_ts_lb; - read_opts.iter_start_seqnum = start_seqs[i] + 1; - std::unique_ptr it(db_->NewIterator(read_opts)); - uint64_t key = kMinKey; - SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; - for (it->Seek(Key1(kMinKey)); it->Valid(); it->Next()) { - CheckIterEntry(it.get(), Key1(key), expected_seq, - (key % 2) ? kTypeValue : kTypeDeletionWithTimestamp, - "value" + std::to_string(i + 1), write_ts_list[i + 1]); - ++key; - --expected_seq; - } - } + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); Close(); } -TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) { +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; - constexpr size_t kNumKeys = 16; - options.max_sequential_skip_in_iterations = kNumKeys / 2; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const size_t kTimestampSize = Timestamp(0, 0).size(); TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; DestroyAndReopen(options); - // Insert kNumKeys + WriteOptions write_opts; - Status s; - for (size_t i = 0; i != kNumKeys; ++i) { - std::string ts_str = Timestamp(static_cast(i + 1), 0); - Slice ts = ts_str; - write_opts.timestamp = &ts; - s = db_->Put(write_opts, "foo", "value" + std::to_string(i)); - ASSERT_OK(s); - } - { - ReadOptions read_opts; - std::string ts_str = Timestamp(1, 0); - Slice ts = ts_str; - read_opts.timestamp = &ts; - std::unique_ptr iter(db_->NewIterator(read_opts)); - iter->SeekToFirst(); - CheckIterUserEntry(iter.get(), "foo", 
kTypeValue, "value0", ts_str); - ASSERT_EQ( - 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); - } + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector timestamps(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + std::vector cfhs(keys.size(), cfh); + std::vector statuses = + db_->MultiGet(read_opts, cfhs, keys, &values, ×tamps); + + ASSERT_OK(statuses[0]); Close(); } -TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) { +TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; - constexpr size_t kNumKeys = 16; - options.max_sequential_skip_in_iterations = kNumKeys / 2; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); const size_t kTimestampSize = Timestamp(0, 0).size(); TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; DestroyAndReopen(options); - // Write kNumKeys + 1 keys + constexpr size_t max_skippable_internal_keys = 2; + const size_t kNumKeys = max_skippable_internal_keys + 2; WriteOptions write_opts; Status s; - for (size_t i = 0; i != kNumKeys; ++i) { - std::string ts_str = Timestamp(static_cast(i + 1), 0); + { + std::string ts_str = Timestamp(1, 0); Slice ts = ts_str; write_opts.timestamp = &ts; - s = db_->Put(write_opts, "a", "value" + std::to_string(i)); - ASSERT_OK(s); + ASSERT_OK(db_->Put(write_opts, "a", "value")); } - { - std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); - WriteBatch batch(0, 0, kTimestampSize); - ASSERT_OK(batch.Put("a", "new_value")); - ASSERT_OK(batch.Put("b", "new_value")); - s = batch.AssignTimestamp(ts_str); - ASSERT_OK(s); - s = db_->Write(write_opts, &batch); + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "b", "value" + std::to_string(i)); ASSERT_OK(s); } { ReadOptions read_opts; - std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + read_opts.max_skippable_internal_keys = max_skippable_internal_keys; + std::string ts_str = Timestamp(1, 0); Slice ts = ts_str; read_opts.timestamp = &ts; std::unique_ptr iter(db_->NewIterator(read_opts)); - iter->Seek("a"); + iter->SeekToFirst(); iter->Next(); - CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str); - ASSERT_EQ( - 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + ASSERT_TRUE(iter->status().IsIncomplete()); } Close(); } -TEST_F(DBBasicTestWithTimestamp, MaxKeysSkipped) { - Options options = CurrentOptions(); +TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) { + Options options = GetDefaultOptions(); options.env = env_; options.create_if_missing = true; const size_t kTimestampSize = Timestamp(0, 0).size(); @@ -506,13 +1412,13 @@ TEST_F(DBBasicTestWithTimestamp, MaxKeysSkipped) { std::string ts_str = Timestamp(1, 0); Slice ts = ts_str; write_opts.timestamp = &ts; - ASSERT_OK(db_->Put(write_opts, "a", "value")); + ASSERT_OK(db_->Put(write_opts, "b", "value")); } for (size_t i = 0; i < kNumKeys; ++i) { std::string ts_str = Timestamp(static_cast(i + 1), 0); Slice ts = ts_str; 
write_opts.timestamp = &ts; - s = db_->Put(write_opts, "b", "value" + std::to_string(i)); + s = db_->Put(write_opts, "a", "value" + std::to_string(i)); ASSERT_OK(s); } { @@ -522,8 +1428,8 @@ TEST_F(DBBasicTestWithTimestamp, MaxKeysSkipped) { Slice ts = ts_str; read_opts.timestamp = &ts; std::unique_ptr iter(db_->NewIterator(read_opts)); - iter->SeekToFirst(); - iter->Next(); + iter->SeekToLast(); + iter->Prev(); ASSERT_TRUE(iter->status().IsIncomplete()); } Close(); @@ -588,6 +1494,138 @@ TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) { Close(); } +class DBBasicTestWithTimestampFilterPrefixSettings + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, bool, bool, + std::shared_ptr, bool, double, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTimestampFilterPrefixSettings() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_filter_prefix") {} +}; + +TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, GetAndMultiGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = std::get<1>(GetParam()); + bbto.cache_index_and_filter_blocks = std::get<2>(GetParam()); + bbto.index_type = std::get<6>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor = std::get<3>(GetParam()); + options.memtable_whole_key_filtering = std::get<4>(GetParam()); + options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam()); + + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + const int kMaxKey = 1000; + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + int idx = 0; + for (; idx < kMaxKey / 4; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (; idx < kMaxKey / 2; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + ASSERT_OK(Flush()); + + for (; idx < kMaxKey; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + + ReadOptions read_opts_total_order; + read_opts_total_order.timestamp = &ts; + read_opts_total_order.total_order_seek = true; + + for (idx = 0; idx < kMaxKey; idx++) { + size_t batch_size = 4; + std::vector keys_str(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + + keys_str[0] = Key1(idx); + keys_str[1] = KeyWithPrefix("foo", idx); + keys_str[2] = Key1(kMaxKey + idx); + keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx); + + auto keys = ConvertStrToSlice(keys_str); + + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + for (int i = 0; i < 2; i++) { + ASSERT_OK(statuses[i]); + } + for (int i = 2; i < 4; i++) { + ASSERT_TRUE(statuses[i].IsNotFound()); + } + + for (int i = 0; i < 2; i++) { + std::string value; + ASSERT_OK(db_->Get(read_opts, 
keys[i], &value)); + std::unique_ptr it1(db_->NewIterator(read_opts)); + ASSERT_NE(nullptr, it1); + ASSERT_OK(it1->status()); + // TODO(zjay) Fix seek with prefix + // it1->Seek(keys[i]); + // ASSERT_TRUE(it1->Valid()); + } + + for (int i = 2; i < 4; i++) { + std::string value; + Status s = db_->Get(read_opts, keys[i], &value); + ASSERT_TRUE(s.IsNotFound()); + } + } + Close(); +} + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampFilterPrefixSettings, + ::testing::Combine( + ::testing::Values( + std::shared_ptr(nullptr), + std::shared_ptr(NewBloomFilterPolicy(10, true)), + std::shared_ptr(NewBloomFilterPolicy(10, + false))), + ::testing::Bool(), ::testing::Bool(), + ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(1)), + std::shared_ptr(NewFixedPrefixTransform(4)), + std::shared_ptr(NewFixedPrefixTransform(7)), + std::shared_ptr(NewFixedPrefixTransform(8))), + ::testing::Bool(), ::testing::Values(0, 0.1), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); + class DataVisibilityTest : public DBBasicTestWithTimestampBase { public: DataVisibilityTest() : DBBasicTestWithTimestampBase("data_visibility_test") { @@ -1051,7 +2089,7 @@ TEST_F(DataVisibilityTest, MultiGetWithTimestamp) { VerifyDefaultCF(snap0); VerifyDefaultCF(snap1); - Flush(); + ASSERT_OK(Flush()); const Snapshot* snap2 = db_->GetSnapshot(); PutTestData(2); @@ -1137,7 +2175,7 @@ TEST_F(DataVisibilityTest, MultiGetCrossCF) { VerifyDefaultCF(snap0); VerifyDefaultCF(snap1); - Flush(); + ASSERT_OK(Flush()); const Snapshot* snap2 = db_->GetSnapshot(); PutTestData(2); @@ -1518,6 +2556,8 @@ TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) { options.create_if_missing = true; options.env = env_; options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.memtable_whole_key_filtering = true; size_t ts_sz = Timestamp(0, 0).size(); TestComparator test_cmp(ts_sz); @@ -1665,14 +2705,15 @@ class DBBasicTestWithTimestampPrefixSeek : public DBBasicTestWithTimestampBase, public testing::WithParamInterface< std::tuple, - std::shared_ptr, bool>> { + std::shared_ptr, bool, + BlockBasedTableOptions::IndexType>> { public: DBBasicTestWithTimestampPrefixSeek() : DBBasicTestWithTimestampBase( "/db_basic_test_with_timestamp_prefix_seek") {} }; -TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { +TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { const size_t kNumKeysPerFile = 128; Options options = CurrentOptions(); options.env = env_; @@ -1684,6 +2725,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); BlockBasedTableOptions bbto; bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); @@ -1722,6 +2764,13 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { "value" + std::to_string(i), write_ts_list[i]); iter->Next(); ASSERT_FALSE(iter->Valid()); + + // Seek to kMinKey + iter->Seek(Key1(kMinKey)); + CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); } const std::vector targets = 
{kMinKey, kMinKey + 0x10, kMinKey + 0x100, kMaxKey}; @@ -1737,6 +2786,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { Slice read_ts = read_ts_list[i]; read_opts.timestamp = &read_ts; std::unique_ptr it(db_->NewIterator(read_opts)); + // Forward and backward iterate. for (size_t j = 0; j != targets.size(); ++j) { std::string start_key = Key1(targets[j]); uint64_t expected_ub = @@ -1760,6 +2810,24 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { it->Next(); } ASSERT_EQ(expected_ub - targets[j] + 1, count); + + count = 0; + expected_key = targets[j]; + it->SeekForPrev(start_key); + uint64_t expected_lb = (targets[j] & kPrefixMask); + while (it->Valid()) { + // Out of prefix + if (!read_opts.prefix_same_as_start && + pe->Transform(it->key()) != pe->Transform(start_key)) { + break; + } + CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + ++count; + --expected_key; + it->Prev(); + } + ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count); } } } @@ -1772,6 +2840,7 @@ INSTANTIATE_TEST_CASE_P( Timestamp, DBBasicTestWithTimestampPrefixSeek, ::testing::Combine( ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(1)), std::shared_ptr(NewFixedPrefixTransform(4)), std::shared_ptr(NewFixedPrefixTransform(7)), std::shared_ptr(NewFixedPrefixTransform(8))), @@ -1781,19 +2850,25 @@ INSTANTIATE_TEST_CASE_P( std::shared_ptr( NewBloomFilterPolicy(20 /*bits_per_key*/, false))), - ::testing::Bool())); + ::testing::Bool(), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); class DBBasicTestWithTsIterTombstones : public DBBasicTestWithTimestampBase, public testing::WithParamInterface< std::tuple, - std::shared_ptr, int>> { + std::shared_ptr, int, + BlockBasedTableOptions::IndexType>> { public: DBBasicTestWithTsIterTombstones() : DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {} }; -TEST_P(DBBasicTestWithTsIterTombstones, ForwardIterDelete) { +TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) { constexpr size_t kNumKeysPerFile = 128; Options options = CurrentOptions(); options.env = env_; @@ -1804,6 +2879,7 @@ TEST_P(DBBasicTestWithTsIterTombstones, ForwardIterDelete) { options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); BlockBasedTableOptions bbto; bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.num_levels = std::get<2>(GetParam()); DestroyAndReopen(options); @@ -1823,7 +2899,7 @@ TEST_P(DBBasicTestWithTsIterTombstones, ForwardIterDelete) { } ++key; } while (true); - // Delete them all + ts = write_ts_strs[1]; write_opts.timestamp = &ts; for (key = kMaxKey; key >= kMinKey; --key) { @@ -1850,6 +2926,13 @@ TEST_P(DBBasicTestWithTsIterTombstones, ForwardIterDelete) { ASSERT_EQ("value1" + std::to_string(key), iter->value()); } ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); + + for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid(); + key -= 2, ++count, iter->Prev()) { + ASSERT_EQ(Key1(key), iter->key()); + ASSERT_EQ("value1" + std::to_string(key), iter->value()); + } + ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); } Close(); } @@ -1865,7 +2948,12 @@ INSTANTIATE_TEST_CASE_P( NewBloomFilterPolicy(10, false)), std::shared_ptr( 
NewBloomFilterPolicy(20, false))), - ::testing::Values(2, 6))); + ::testing::Values(2, 6), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc new file mode 100644 index 00000000000..0ae74475284 --- /dev/null +++ b/db/db_write_buffer_manager_test.cc @@ -0,0 +1,801 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "db/write_thread.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class DBWriteBufferManagerTest : public DBTestBase, + public testing::WithParamInterface { + public: + DBWriteBufferManagerTest() + : DBTestBase("/db_write_buffer_manager_test", /*env_do_fsync=*/false) {} + bool cost_cache_; +}; + +TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + // This make sures write will go through and if stall was in effect, it will + // end. + ASSERT_OK(Put(0, Key(2), DummyString(1), wo)); +} + +// Test Single DB with multiple writer threads get blocked when +// WriteBufferManager execeeds buffer_size_ and flush is waiting to be +// finished. 
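The stall configuration shared by all of the write-buffer-manager tests in this file can be summarized as follows (a minimal sketch using only what the tests themselves set up; the third WriteBufferManager constructor argument is the allow-stall flag these tests pass as true):

    std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
    Options options;
    options.write_buffer_size = 500000;  // per-memtable limit, never reached here
    // With the stall flag set, writers block once total memtable usage exceeds
    // the 100000-byte budget and stay blocked until a flush frees memory.
    options.write_buffer_manager =
        std::make_shared<WriteBufferManager>(100000, cache, true);
    // The same manager object can be shared by several DBs and column families,
    // which is what the multi-DB tests below rely on.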
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + std::unordered_set w_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_set.insert(w); + // Allow the flush to continue if all writer threads are blocked. + if (w_set.size() == (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // and they will be blocked. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(writer, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_writers; i++) { + threads.emplace_back(writer, i % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. 
+ ASSERT_EQ(w_set.size(), num_writers); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple DBs get blocked when WriteBufferManager limit exceeds and flush +// is waiting to be finished but DBs tries to write meanwhile. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager Limit exceeded. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Since this is the last DB, signal Flush to continue. + if (wait_count_db == num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + // Write to DB. + std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // db_ will write and will be blocked (as Flush will on hold and will create + // stall in effect). + // | + // multiple dbs writers will be created to write to that db and they will be + // blocked. + // | + // | + // Last writer will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(write_db, db_); + // Wait untill first DB is blocked and then create the multiple writers for + // different DBs which will be blocked from getting added to the queue because + // stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_dbs; i++) { + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Clean up DBs. 
+ for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple DBs and multiple columns get +// blocked when stall by WriteBufferManager is in effect. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_set; + std::vector writer_threads; + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + thread_num.fetch_add(1); + cv.Signal(); + // Allow the flush to continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + { + InstrumentedMutexLock lock(&mutex); + w_set.insert(w); + thread_num.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + // Write to multiple columns of db_. + std::function write_cf = [&](int cf) { + Status tmp = Put(cf, Key(3), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + // Write to multiple DBs. 
+ std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s2 = s2 && tmp.ok(); + }; + + // Flow: + // thread will write to db_ will be blocked (as Flush will on hold, + // buffer_size_ has exceeded and will create stall in effect). + // | + // | + // multiple writers threads writing to different DBs and to db_ across + // multiple columns will be created and they will be blocked due to stall. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + threads.emplace_back(write_db, db_); + // Wait untill first thread is blocked and then create the multiple writer + // threads. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i++) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_cf, i % 3); + // Write to different dbs. + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + + // Number of DBs blocked. + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_dbs); + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ by passing +// different values to WriteOption.no_slown_down. +TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // db_ is completed. 
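In essence, the mixed no_slowdown behavior exercised from here on boils down to the following (sketch only; the assertions below merely require the no_slowdown write to return a non-OK status while the stall is active):

    WriteOptions blocking;
    blocking.no_slowdown = false;        // writer waits until the stall clears
    WriteOptions non_blocking;
    non_blocking.no_slowdown = true;     // writer refuses to wait
    Status s1 = db->Put(blocking, "k1", "v");      // succeeds once flush unblocks writers
    Status s2 = db->Put(non_blocking, "k2", "v");  // fails immediately during the stall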
+ + std::unordered_set w_slowdown_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load( + std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + + std::function write_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + + (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // with different values of WriteOptions.no_slowdown. Some of them will + // be blocked and some of them will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_writers; i += 2) { + threads.emplace_back(write_no_slow_down, (i) % 4); + threads.emplace_back(write_slow_down, (i + 1) % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_slowdown_set.size(), num_writers / 2); + // Number of Writer threads with WriteOptions.no_slowdown = true. 
+ ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ and different +// dbs by passing different values to WriteOption.no_slown_down. +TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 4; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_slowdown_set; + std::vector writer_threads; + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + InstrumentedMutexLock lock(&mutex); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. 
+ if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + std::function write_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // first thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // of db_ and different DBs with different values of + // WriteOptions.no_slowdown. Some of them will be blocked and some of them + // will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, db_); + // Wait untill first thread writing to DB is blocked and then + // create the multiple writers. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i += 2) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_slow_down, db_); + writer_threads.emplace_back(write_no_slow_down, db_); + // Write to different DBs. + threads.emplace_back(write_slow_down, dbs[i]); + threads.emplace_back(write_no_slow_down, dbs[i + 1]); + } + + for (auto& t : threads) { + t.join(); + } + + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ((num_dbs / 2) + 1, wait_count_db); + // Number of writer threads writing to db_ blocked from getting added to the + // queue. + ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2); + // Number of threads with WriteOptions.no_slowdown = true. + ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs); + + // Clean up DBs. 
+ for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest, + testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_write_test.cc b/db/db_write_test.cc index b2389605060..4a8f90c2cb2 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -60,14 +60,15 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = false; - dbfull()->Put(wo, key, "bar"); + ASSERT_OK(dbfull()->Put(wo, key, "bar")); }; std::function write_no_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = true; - dbfull()->Put(wo, key, "bar"); + Status s = dbfull()->Put(wo, key, "bar"); + ASSERT_TRUE(s.ok() || s.IsIncomplete()); }; std::function unblock_main_thread_func = [&](void*) { mutex.Lock(); @@ -77,13 +78,13 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { }; // Create 3 L0 files and schedule 4th without waiting - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); @@ -104,7 +105,7 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { // write_thread FlushOptions fopt; fopt.wait = false; - dbfull()->Flush(fopt); + ASSERT_OK(dbfull()->Flush(fopt)); // Create a mix of slowdown/no_slowdown write threads mutex.Lock(); @@ -145,7 +146,7 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { mutex.Unlock(); TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1"); - dbfull()->TEST_WaitForFlushMemTable(nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); // This would have triggered a write stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2"); // The leader is going to create missing newer links. 
When the leader @@ -178,14 +179,15 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = false; - dbfull()->Put(wo, key, "bar"); + ASSERT_OK(dbfull()->Put(wo, key, "bar")); }; std::function write_no_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = true; - dbfull()->Put(wo, key, "bar"); + Status s = dbfull()->Put(wo, key, "bar"); + ASSERT_TRUE(s.ok() || s.IsIncomplete()); }; std::function unblock_main_thread_func = [&](void *) { mutex.Lock(); @@ -195,13 +197,13 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { }; // Create 3 L0 files and schedule 4th without waiting - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); @@ -222,7 +224,7 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { // write_thread FlushOptions fopt; fopt.wait = false; - dbfull()->Flush(fopt); + ASSERT_OK(dbfull()->Flush(fopt)); // Create a mix of slowdown/no_slowdown write threads mutex.Lock(); @@ -243,7 +245,7 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { mutex.Unlock(); TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1"); - dbfull()->TEST_WaitForFlushMemTable(nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); // This would have triggered a write stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2"); // The leader is going to create missing newer links. When the leader finishes, @@ -260,7 +262,7 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { constexpr int kNumThreads = 5; std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); @@ -307,6 +309,11 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { threads[i].join(); } ASSERT_EQ(1, leader_count); + + // The Failed PUT operations can cause a BG error to be set. + // Mark it as Checked for the ASSERT_STATUS_CHECKED + dbfull()->Resume().PermitUncheckedError(); + // Close before mock_env destruct. 
Close(); } @@ -320,7 +327,7 @@ TEST_P(DBWriteTest, ManualWalFlushInEffect) { ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty()); // try the 2nd wal created during SwitchWAL - dbfull()->TEST_SwitchWAL(); + ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_TRUE(Put("key" + ToString(0), "value").ok()); ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); @@ -329,7 +336,7 @@ TEST_P(DBWriteTest, ManualWalFlushInEffect) { TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); @@ -351,7 +358,9 @@ TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { } */ if (!options.manual_wal_flush) { - ASSERT_FALSE(res.ok()); + ASSERT_NOK(res); + } else { + ASSERT_OK(res); } } // Close before mock_env destruct. @@ -361,7 +370,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) { Random rnd(301); std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); options.writable_file_max_buffer_size = 4 * 1024 * 1024; @@ -395,7 +404,7 @@ TEST_P(DBWriteTest, LockWalInEffect) { ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false)); ASSERT_OK(dbfull()->UnlockWAL()); // try the 2nd wal created during SwitchWAL - dbfull()->TEST_SwitchWAL(); + ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_OK(Put("key" + ToString(0), "value")); ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); ASSERT_OK(dbfull()->LockWAL()); @@ -423,13 +432,14 @@ TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) { ROCKSDB_NAMESPACE::WriteOptions write_option_default; std::string no_wal_key = no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i); - this->Put(no_wal_key, no_wal_value, write_option_disable); + ASSERT_OK( + this->Put(no_wal_key, no_wal_value, write_option_disable)); std::string wal_key = wal_key_prefix + std::to_string(i) + "_" + std::to_string(i); - this->Put(wal_key, wal_value, write_option_default); - dbfull()->SyncWAL(); + ASSERT_OK(this->Put(wal_key, wal_value, write_option_default)); + ASSERT_OK(dbfull()->SyncWAL()); } - return 0; + return; }); } for (auto& t: threads) { diff --git a/db/dbformat.cc b/db/dbformat.cc index ada35f1fb2e..8ac0617649f 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -49,7 +49,8 @@ EntryType GetEntryType(ValueType value_type) { bool ParseFullKey(const Slice& internal_key, FullKey* fkey) { ParsedInternalKey ikey; - if (ParseInternalKey(internal_key, &ikey) != Status::OK()) { + if (!ParseInternalKey(internal_key, &ikey, false /*log_err_key */) + .ok()) { // TODO return false; } fkey->user_key = ikey.user_key; @@ -77,12 +78,34 @@ void AppendInternalKeyFooter(std::string* result, SequenceNumber s, PutFixed64(result, PackSequenceAndType(s, t)); } -std::string ParsedInternalKey::DebugString(bool hex) const { +void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMin(ts_sz, static_cast(0)); + result->append(key.data(), key.size()); + result->append(kTsMin.data(), ts_sz); +} + +void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMax(ts_sz, static_cast(0xff)); + 
result->append(key.data(), key.size()); + result->append(kTsMax.data(), ts_sz); +} + +std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const { + std::string result = "'"; + if (log_err_key) { + result += user_key.ToString(hex); + } else { + result += ""; + } + char buf[50]; snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence, static_cast(type)); - std::string result = "'"; - result += user_key.ToString(hex); + result += buf; return result; } @@ -90,8 +113,8 @@ std::string ParsedInternalKey::DebugString(bool hex) const { std::string InternalKey::DebugString(bool hex) const { std::string result; ParsedInternalKey parsed; - if (ParseInternalKey(rep_, &parsed) == Status::OK()) { - result = parsed.DebugString(hex); + if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) { + result = parsed.DebugString(true /* log_err_key */, hex); // TODO } else { result = "(bad)"; result.append(EscapeString(rep_)); diff --git a/db/dbformat.h b/db/dbformat.h index 81c852ac40a..c3f5c543735 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -112,13 +112,19 @@ struct ParsedInternalKey { // u contains timestamp if user timestamp feature is enabled. ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) : user_key(u), sequence(seq), type(t) {} - std::string DebugString(bool hex = false) const; + std::string DebugString(bool log_err_key, bool hex) const; void clear() { user_key.clear(); sequence = 0; type = kTypeDeletion; } + + void SetTimestamp(const Slice& ts) { + assert(ts.size() <= user_key.size()); + const char* addr = user_key.data() + user_key.size() - ts.size(); + memcpy(const_cast(addr), ts.data(), ts.size()); + } }; // Return the length of the encoding of "key". @@ -140,8 +146,10 @@ inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, *seq = packed >> 8; *t = static_cast(packed & 0xff); - assert(*seq <= kMaxSequenceNumber); - assert(IsExtendedValueType(*t)); + // Commented the following two assertions in order to test key-value checksum + // on corrupted keys without crashing ("DbKvChecksumTest"). + // assert(*seq <= kMaxSequenceNumber); + // assert(IsExtendedValueType(*t)); } EntryType GetEntryType(ValueType value_type); @@ -161,12 +169,20 @@ extern void AppendInternalKeyWithDifferentTimestamp( extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s, ValueType t); +// Append the key and a minimal timestamp to *result +extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// Append the key and a maximal timestamp to *result +extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + // Attempt to parse an internal key from "internal_key". On success, // stores the parsed data in "*result", and returns true. // // On error, returns false, leaves "*result" in an undefined state. extern Status ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result); + ParsedInternalKey* result, bool log_err_key); // Returns the user key portion of an internal key. inline Slice ExtractUserKey(const Slice& internal_key) { @@ -285,8 +301,8 @@ class InternalKey { bool Valid() const { ParsedInternalKey parsed; - return (ParseInternalKey(Slice(rep_), &parsed) == Status::OK()) ? 
true - : false; + return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */) + .ok()); // TODO } void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } @@ -319,7 +335,7 @@ class InternalKey { AppendInternalKeyFooter(&rep_, s, t); } - std::string DebugString(bool hex = false) const; + std::string DebugString(bool hex) const; }; inline int InternalKeyComparator::Compare(const InternalKey& a, @@ -328,20 +344,27 @@ inline int InternalKeyComparator::Compare(const InternalKey& a, } inline Status ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result) { + ParsedInternalKey* result, bool log_err_key) { const size_t n = internal_key.size(); + if (n < kNumInternalBytes) { - return Status::Corruption("Internal Key too small"); + return Status::Corruption("Corrupted Key: Internal Key too small. Size=" + + std::to_string(n) + ". "); } + uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes); unsigned char c = num & 0xff; result->sequence = num >> 8; result->type = static_cast(c); assert(result->type <= ValueType::kMaxValue); result->user_key = Slice(internal_key.data(), n - kNumInternalBytes); - return IsExtendedValueType(result->type) - ? Status::OK() - : Status::Corruption("Invalid Key Type"); + + if (IsExtendedValueType(result->type)) { + return Status::OK(); + } else { + return Status::Corruption("Corrupted Key", + result->DebugString(log_err_key, true)); + } } // Update the sequence number in the internal key. @@ -475,15 +498,21 @@ class IterKey { // Update the sequence number in the internal key. Guarantees not to // invalidate slices to the key (and the user key). - void UpdateInternalKey(uint64_t seq, ValueType t) { + void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) { assert(!IsKeyPinned()); assert(key_size_ >= kNumInternalBytes); + if (ts) { + assert(key_size_ >= kNumInternalBytes + ts->size()); + memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(), + ts->size()); + } uint64_t newval = (seq << 8) | t; EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval); } bool IsKeyPinned() const { return (key_ != buf_); } + // user_key does not have timestamp. void SetInternalKey(const Slice& key_prefix, const Slice& user_key, SequenceNumber s, ValueType value_type = kValueTypeForSeek, @@ -587,7 +616,7 @@ class IterKey { void EnlargeBuffer(size_t key_size); }; -// Convert from a SliceTranform of user keys, to a SliceTransform of +// Convert from a SliceTransform of user keys, to a SliceTransform of // user keys. class InternalKeySliceTransform : public SliceTransform { public: diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 09ee4a38b4d..06c5bb2025e 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
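// A minimal usage sketch for the ParseInternalKey() signature declared above
// (illustrative; `ikey` is a hypothetical internal-key slice):
//   ParsedInternalKey parsed;
//   Status s = ParseInternalKey(ikey, &parsed, true /* log_err_key */);
//   if (!s.ok()) {
//     // The key is corrupted; the Corruption message includes the user key
//     // only when log_err_key is true, so pass false to keep key contents
//     // out of error logs.
//   }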
#include "db/dbformat.h" -#include "logging/logging.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { @@ -41,12 +40,12 @@ static void TestKey(const std::string& key, Slice in(encoded); ParsedInternalKey decoded("", 0, kTypeValue); - ASSERT_OK(ParseInternalKey(in, &decoded)); + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); ASSERT_EQ(key, decoded.user_key.ToString()); ASSERT_EQ(seq, decoded.sequence); ASSERT_EQ(vt, decoded.type); - ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded)); + ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded, true /* log_err_key */)); } class FormatTest : public testing::Test {}; @@ -186,7 +185,7 @@ TEST_F(FormatTest, UpdateInternalKey) { Slice in(ikey); ParsedInternalKey decoded; - ASSERT_OK(ParseInternalKey(in, &decoded)); + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); ASSERT_EQ(user_key, decoded.user_key.ToString()); ASSERT_EQ(new_seq, decoded.sequence); ASSERT_EQ(new_val_type, decoded.type); diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 635951a1e1c..38bc4960275 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -40,7 +40,7 @@ class DeleteFileTest : public DBTestBase { wal_dir_(dbname_ + "/wal_files") {} void SetOptions(Options* options) { - assert(options); + ASSERT_NE(options, nullptr); options->delete_obsolete_files_period_micros = 0; // always do full purge options->enable_thread_tracking = true; options->write_buffer_size = 1024 * 1024 * 1000; @@ -105,14 +105,14 @@ class DeleteFileTest : public DBTestBase { void CheckFileTypeCounts(const std::string& dir, int required_log, int required_sst, int required_manifest) { std::vector filenames; - env_->GetChildren(dir, &filenames); + ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; for (auto file : filenames) { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } @@ -180,7 +180,8 @@ TEST_F(DeleteFileTest, AddKeysAndQueryLevels) { ASSERT_TRUE(status.IsInvalidArgument()); // Lowest level file deletion should succeed. 
- ASSERT_OK(db_->DeleteFile(level2file)); + status = db_->DeleteFile(level2file); + ASSERT_OK(status); } TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { @@ -201,7 +202,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { compact_options.change_level = true; compact_options.target_level = 2; Slice first_slice(first), last_slice(last); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 1 sst after compaction CheckFileTypeCounts(dbname_, 0, 1, 1); @@ -210,7 +211,9 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { Iterator *itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); + ASSERT_OK(itr->status()); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; @@ -237,7 +240,8 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; itr = db_->NewIterator(read_options); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); test::SleepingBackgroundTask sleeping_task_before; @@ -344,11 +348,12 @@ TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; itr = db_->NewIterator(read_options); + ASSERT_OK(itr->status()); // ReadOptions is deleted, but iterator cleanup function should not be // affected } - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; @@ -382,9 +387,11 @@ TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; Iterator* itr1 = db_->NewIterator(read_options); + ASSERT_OK(itr1->status()); CreateTwoLevels(); Iterator* itr2 = db_->NewIterator(read_options); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr2->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 5 sst files after 2 compactions with 2 live iterators CheckFileTypeCounts(dbname_, 0, 5, 1); @@ -417,6 +424,7 @@ TEST_F(DeleteFileTest, DeleteFileWithIterator) { CreateTwoLevels(); ReadOptions read_options; Iterator* it = db_->NewIterator(read_options); + ASSERT_OK(it->status()); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -432,7 +440,7 @@ TEST_F(DeleteFileTest, DeleteFileWithIterator) { Status status = db_->DeleteFile(level2file); fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(), status.ToString().c_str()); - ASSERT_TRUE(status.ok()); + ASSERT_OK(status); it->SeekToFirst(); int numKeysIterated = 0; while(it->Valid()) { @@ -452,7 +460,7 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { AddKeys(10, 0); VectorLogPtr logfiles; - db_->GetSortedWalFiles(logfiles); + ASSERT_OK(db_->GetSortedWalFiles(logfiles)); ASSERT_GT(logfiles.size(), 0UL); // Take the last log file which is expected to be alive and try to delete it // Should not succeed because live logs are not allowed to be deleted @@ -461,7 
+469,7 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); fprintf(stdout, "Deleting alive log file %s\n", alive_log->PathName().c_str()); - ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); + ASSERT_NOK(db_->DeleteFile(alive_log->PathName())); ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); logfiles.clear(); @@ -469,10 +477,10 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { // Call Flush again to flush out memtable and move alive log to archived log // and try to delete the archived log file FlushOptions fopts; - db_->Flush(fopts); + ASSERT_OK(db_->Flush(fopts)); AddKeys(10, 0); - db_->Flush(fopts); - db_->GetSortedWalFiles(logfiles); + ASSERT_OK(db_->Flush(fopts)); + ASSERT_OK(db_->GetSortedWalFiles(logfiles)); ASSERT_GT(logfiles.size(), 0UL); std::unique_ptr archived_log = std::move(logfiles.front()); ASSERT_EQ(archived_log->Type(), kArchivedLogFile); @@ -480,8 +488,8 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { fprintf(stdout, "Deleting archived log file %s\n", archived_log->PathName().c_str()); ASSERT_OK(db_->DeleteFile(archived_log->PathName())); - ASSERT_EQ(Status::NotFound(), - env_->FileExists(wal_dir_ + "/" + archived_log->PathName())); + ASSERT_TRUE( + env_->FileExists(wal_dir_ + "/" + archived_log->PathName()).IsNotFound()); } TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { @@ -520,6 +528,7 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { { std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); + ASSERT_OK(itr->status()); int count = 0; for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { ASSERT_OK(itr->status()); diff --git a/db/error_handler.cc b/db/error_handler.cc index 7aa4aa82689..b5c353a6908 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -4,9 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). // #include "db/error_handler.h" + #include "db/db_impl/db_impl.h" #include "db/event_helpers.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" namespace ROCKSDB_NAMESPACE { @@ -111,6 +113,23 @@ std::map, Status::Code::kIOError, Status::SubCode::kIOFenced, false), Status::Severity::kFatalError}, + // Errors during MANIFEST write when WAL is disabled + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, }; @@ -175,6 +194,12 @@ std::map, {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, Status::Code::kIOError, false), Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, false), + Status::Severity::kFatalError}, }; std::map, Status::Severity> @@ -244,12 +269,19 @@ void ErrorHandler::CancelErrorRecovery() { // This can also get called as part of a recovery operation. 
In that case, we // also track the error separately in recovery_error_ so we can tell in the // end whether recovery succeeded or not -Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) { +const Status& ErrorHandler::SetBGError(const Status& bg_err, + BackgroundErrorReason reason) { db_mutex_->AssertHeld(); if (bg_err.ok()) { - return Status::OK(); + return bg_err; } + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set regular background error\n"); + bool paranoid = db_options_.paranoid_checks; Status::Severity sev = Status::Severity::kFatalError; Status new_bg_err; @@ -324,11 +356,32 @@ Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReas return bg_error_; } -Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, - BackgroundErrorReason reason) { +// This is the main function for looking at IO-related errors during +// background operations. The main logic is: +// 1) File scope IO error is treated as retryable IO error in the write +// path. In RocksDB, if a file has a write IO error and it is at file scope, +// RocksDB never writes to the same file again. RocksDB will create a new +// file and rewrite the whole content. Thus, it is retryable. +// 2) if the error is caused by data loss, the error is mapped to +// unrecoverable error. Application/user must take action to handle +// this situation (the file scope case is excluded). +// 3) if the error is a Retryable IO error (i.e., it is a file scope IO error, +// or its retryable flag is set and it is not a data loss error), auto resume +// will be called and the auto resume can be controlled by the resume count +// and resume interval options. There are three sub-cases: +// a) if the error happens during compaction, it is mapped to a soft error. +// The compaction thread will reschedule a new compaction. +// b) if the error happens during flush and the WAL is empty, it is mapped +// to a soft error. Note that this includes the case that the IO error happens +// in SST or manifest write during flush. +// c) all other errors are mapped to hard error. +// 4) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason +// reason) will be called to handle other error cases. +const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason) { db_mutex_->AssertHeld(); if (bg_io_err.ok()) { - return Status::OK(); + return bg_io_err; } ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", bg_io_err.ToString().c_str()); @@ -336,45 +389,74 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, if (recovery_in_prog_ && recovery_io_error_.ok()) { recovery_io_error_ = bg_io_err; } - if (BackgroundErrorReason::kManifestWrite == reason) { + if (BackgroundErrorReason::kManifestWrite == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { // Always returns ok db_->DisableFileDeletionsWithLock().PermitUncheckedError(); } Status new_bg_io_err = bg_io_err; - Status s; DBRecoverContext context; - if (bg_io_err.GetDataLoss()) { - // First, data loss is treated as unrecoverable error. So it can directly - // overwrite any existing bg_error_. + if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && + bg_io_err.GetDataLoss()) { + // First, data loss (non file scope) is treated as unrecoverable error. So + // it can directly overwrite any existing bg_error_. 
bool auto_recovery = false; Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); bg_error_ = bg_err; if (recovery_in_prog_ && recovery_error_.ok()) { recovery_error_ = bg_err; } - EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, - db_mutex_, &auto_recovery); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Set background IO error as unrecoverable error\n"); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &bg_err, db_mutex_, &auto_recovery); recover_context_ = context; return bg_error_; - } else if (bg_io_err.GetRetryable()) { - // Second, check if the error is a retryable IO error or not. if it is - // retryable error and its severity is higher than bg_error_, overwrite - // the bg_error_ with new error. - // In current stage, for retryable IO error of compaction, treat it as - // soft error. In other cases, treat the retryable IO error as hard - // error. + } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace && + (bg_io_err.GetScope() == + IOStatus::IOErrorScope::kIOErrorScopeFile || + bg_io_err.GetRetryable())) { + // Second, check if the error is a retryable IO error (file scope IO error + // is also treated as retryable IO error in RocksDB write path). if it is + // retryable error and its severity is higher than bg_error_, overwrite the + // bg_error_ with new error. In current stage, for retryable IO error of + // compaction, treat it as soft error. In other cases, treat the retryable + // IO error as hard error. Note that, all the NoSpace error should be + // handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter + // it is retryable or file scope, this logic will be bypassed. bool auto_recovery = false; - EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, - db_mutex_, &auto_recovery); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &new_bg_io_err, db_mutex_, + &auto_recovery); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set background retryable IO error\n"); if (BackgroundErrorReason::kCompaction == reason) { - Status bg_err(new_bg_io_err, Status::Severity::kSoftError); - if (bg_err.severity() > bg_error_.severity()) { - bg_error_ = bg_err; + // We map the retryable IO error during compaction to soft error. Since + // compaction can reschedule by itself. We will not set the BG error in + // this case + // TODO: a better way to set or clean the retryable IO error which + // happens during compaction SST file write. + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); } - recover_context_ = context; + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Compaction will schedule by itself to resume\n"); return bg_error_; - } else if (BackgroundErrorReason::kFlushNoWAL == reason) { + } else if (BackgroundErrorReason::kFlushNoWAL == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { // When the BG Retryable IO error reason is flush without WAL, // We map it to a soft error. 
At the same time, all the background work // should be stopped except the BG work from recovery. Therefore, we @@ -405,12 +487,14 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, return StartRecoverFromRetryableBGIOError(bg_io_err); } } else { - s = SetBGError(new_bg_io_err, reason); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + return SetBGError(new_bg_io_err, reason); } - return s; } -Status ErrorHandler::OverrideNoSpaceError(Status bg_error, +Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery) { #ifndef ROCKSDB_LITE if (bg_error.severity() >= Status::Severity::kFatalError) { @@ -466,7 +550,11 @@ Status ErrorHandler::ClearBGError() { // Signal that recovery succeeded if (recovery_error_.ok()) { Status old_bg_error = bg_error_; + // Clear and check the recovery IO and BG error bg_error_ = Status::OK(); + recovery_io_error_ = IOStatus::OK(); + bg_error_.PermitUncheckedError(); + recovery_io_error_.PermitUncheckedError(); recovery_in_prog_ = false; soft_error_no_bg_work_ = false; EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, @@ -516,6 +604,7 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { // during the recovery process. While recovering, the only operations that // can generate background errors should be the flush operations recovery_error_ = Status::OK(); + recovery_error_.PermitUncheckedError(); Status s = db_->ResumeImpl(recover_context_); if (s.ok()) { soft_error_no_bg_work_ = false; @@ -537,24 +626,39 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { #endif } -Status ErrorHandler::StartRecoverFromRetryableBGIOError(IOStatus io_error) { +const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( + const IOStatus& io_error) { #ifndef ROCKSDB_LITE db_mutex_->AssertHeld(); - if (bg_error_.ok() || io_error.ok()) { - return Status::OK(); - } - if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_ || - recovery_thread_) { + if (bg_error_.ok()) { + return bg_error_; + } else if (io_error.ok()) { + return io_error; + } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { // Auto resume BG error is not enabled, directly return bg_error_. return bg_error_; } + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n"); + if (recovery_thread_) { + // In this case, if recovery_in_prog_ is false, current thread should + // wait the previous recover thread to finish and create a new thread + // to recover from the bg error. + db_mutex_->Unlock(); + recovery_thread_->join(); + db_mutex_->Lock(); + } recovery_in_prog_ = true; recovery_thread_.reset( new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this)); if (recovery_io_error_.ok() && recovery_error_.ok()) { - return Status::OK(); + return recovery_error_; } else { TEST_SYNC_POINT("StartRecoverRetryableBGIOError:RecoverFail"); return bg_error_; @@ -578,6 +682,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { DBRecoverContext context = recover_context_; int resume_count = db_options_.max_bgerror_resume_count; uint64_t wait_interval = db_options_.bgerror_resume_retry_interval; + uint64_t retry_count = 0; // Recover from the retryable error. Create a separate thread to do it. 
while (resume_count > 0) { if (end_recovery_) { @@ -587,15 +692,24 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1"); recovery_io_error_ = IOStatus::OK(); recovery_error_ = Status::OK(); + retry_count++; Status s = db_->ResumeImpl(context); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume0"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume1"); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT); + } if (s.IsShutdownInProgress() || bg_error_.severity() >= Status::Severity::kFatalError) { // If DB shutdown in progress or the error severity is higher than // Hard Error, stop auto resume and returns. TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail0"); recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } return; } if (!recovery_io_error_.ok() && @@ -606,7 +720,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { // a period of time and redo auto resume if it is allowed. TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1"); - int64_t wait_until = db_->env_->NowMicros() + wait_interval; + int64_t wait_until = db_options_.clock->NowMicros() + wait_interval; cv_.TimedWait(wait_until); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterWait0"); } else { @@ -619,8 +733,15 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); Status old_bg_error = bg_error_; bg_error_ = Status::OK(); + bg_error_.PermitUncheckedError(); EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, old_bg_error, db_mutex_); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT); + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } recovery_in_prog_ = false; if (soft_error_no_bg_work_) { soft_error_no_bg_work_ = false; @@ -631,6 +752,10 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { // In this case: 1) recovery_io_error is more serious or not retryable // 2) other Non IO recovery_error happens. The auto recovery stops. 
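// A sketch of the DBOptions knobs that drive this retry loop (values are
// illustrative; the same settings appear in the error handler tests below):
//   Options options;
//   options.max_bgerror_resume_count = 2;            // at most two resume attempts
//   options.bgerror_resume_retry_interval = 100000;  // wait ~0.1s between attempts
//   options.statistics = CreateDBStatistics();       // records ERROR_HANDLER_* tickers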
recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } return; } } @@ -638,6 +763,10 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { } recovery_in_prog_ = false; TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut"); + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } return; #else return; diff --git a/db/error_handler.h b/db/error_handler.h index 084434101aa..ab1169bc907 100644 --- a/db/error_handler.h +++ b/db/error_handler.h @@ -31,17 +31,15 @@ class ErrorHandler { InstrumentedMutex* db_mutex) : db_(db), db_options_(db_options), - bg_error_(Status::OK()), - recovery_error_(Status::OK()), - recovery_io_error_(IOStatus::OK()), cv_(db_mutex), end_recovery_(false), recovery_thread_(nullptr), db_mutex_(db_mutex), auto_recovery_(false), recovery_in_prog_(false), - soft_error_no_bg_work_(false) {} - ~ErrorHandler() { + soft_error_no_bg_work_(false), + bg_error_stats_(db_options.statistics) { + // Clear the checked flag for uninitialized errors bg_error_.PermitUncheckedError(); recovery_error_.PermitUncheckedError(); recovery_io_error_.PermitUncheckedError(); @@ -53,13 +51,14 @@ class ErrorHandler { Status::Code code, Status::SubCode subcode); - Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); + const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason); - Status SetBGError(const IOStatus& bg_io_err, BackgroundErrorReason reason); + const Status& SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason); - Status GetBGError() { return bg_error_; } + Status GetBGError() const { return bg_error_; } - Status GetRecoveryError() { return recovery_error_; } + Status GetRecoveryError() const { return recovery_error_; } Status ClearBGError(); @@ -104,15 +103,18 @@ class ErrorHandler { bool auto_recovery_; bool recovery_in_prog_; // A flag to indicate that for the soft error, we should not allow any - // backrgound work execpt the work is from recovery. + // background work except the work is from recovery. bool soft_error_no_bg_work_; // Used to store the context for recover, such as flush reason. DBRecoverContext recover_context_; - Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery); + // The pointer of DB statistics. 
+ std::shared_ptr bg_error_stats_; + + Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery); void RecoverFromNoSpace(); - Status StartRecoverFromRetryableBGIOError(IOStatus io_error); + const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error); void RecoverFromRetryableBGIOError(); }; diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index 895c878ab6e..e0456a82737 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -9,9 +9,9 @@ #ifndef ROCKSDB_LITE #include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/stack_trace.h" #include "rocksdb/io_status.h" -#include "rocksdb/perf_context.h" #include "rocksdb/sst_file_manager.h" #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" @@ -25,7 +25,10 @@ namespace ROCKSDB_NAMESPACE { class DBErrorHandlingFSTest : public DBTestBase { public: DBErrorHandlingFSTest() - : DBTestBase("/db_error_handling_fs_test", /*env_do_fsync=*/true) {} + : DBTestBase("/db_error_handling_fs_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } std::string GetManifestNameFromLiveFiles() { std::vector live_files; @@ -44,21 +47,9 @@ class DBErrorHandlingFSTest : public DBTestBase { } return ""; } -}; - -class DBErrorHandlingFS : public FileSystemWrapper { - public: - DBErrorHandlingFS() - : FileSystemWrapper(FileSystem::Default()), - trig_no_space(false), - trig_io_error(false) {} - - void SetTrigNoSpace() { trig_no_space = true; } - void SetTrigIoError() { trig_io_error = true; } - private: - bool trig_no_space; - bool trig_io_error; + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; }; class ErrorHandlerFSListener : public EventListener { @@ -161,15 +152,13 @@ class ErrorHandlerFSListener : public EventListener { }; TEST_F(DBErrorHandlingFSTest, FLushWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -177,32 +166,89 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteError) { ASSERT_OK(Put(Key(0), "val")); SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + 
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); Reopen(options); ASSERT_EQ("val", Get(Key(0))); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +// All the NoSpace IOError will be handled as the regular BG Error no matter the +// retryable flag is set of not. So the auto resume for retryable IO Error will +// not be triggered. Also, it is mapped as hard error. +TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) { std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -214,59 +260,242 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) { ASSERT_OK(Put(Key(1), "val1")); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + 
ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); Reopen(options); ASSERT_EQ("val1", Get(Key(1))); ASSERT_OK(Put(Key(2), "val2")); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeSyncTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Reopen(options); ASSERT_EQ("val2", Get(Key(2))); ASSERT_OK(Put(Key(3), "val3")); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeCloseTableFile", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Reopen(options); ASSERT_EQ("val3", Get(Key(3))); Destroy(options); } +TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = 
dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val2", Get(Key(2))); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + // not file scope, but retyrable set + error_msg.SetDataLoss(false); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.atomic_flush = true; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + 
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -280,16 +509,16 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { ASSERT_OK(Put(Key(1), "val1", wo)); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_OK(Put(Key(2), "val2", wo)); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); ASSERT_EQ("val2", Get(Key(2))); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val2", Get(Key(2))); ASSERT_OK(Put(Key(3), "val3", wo)); @@ -297,18 +526,27 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { s = Flush(); ASSERT_OK(s); ASSERT_EQ("val3", Get(Key(3))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError2) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) { std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; @@ -326,16 +564,16 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError2) { ASSERT_OK(Put(Key(1), "val1", wo)); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeSyncTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); 
SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_OK(Put(Key(2), "val2", wo)); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); ASSERT_EQ("val2", Get(Key(2))); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val2", Get(Key(2))); ASSERT_OK(Put(Key(3), "val3", wo)); @@ -347,14 +585,11 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError2) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError3) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) { std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; @@ -372,16 +607,16 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError3) { ASSERT_OK(Put(Key(1), "val1", wo)); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeCloseTableFile", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_OK(Put(Key(2), "val2", wo)); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); ASSERT_EQ("val2", Get(Key(2))); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val2", Get(Key(2))); ASSERT_OK(Put(Key(3), "val3", wo)); @@ -394,13 +629,10 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError3) { } TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); Status s; @@ -416,16 +648,17 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { ASSERT_OK(Put(Key(1), "val")); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -437,13 +670,10 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { } TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - 
std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; @@ -463,15 +693,106 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { ASSERT_OK(Put(Key(1), "val")); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val", wo)); + SyncPoint::GetInstance()->SetCallBack( + 
"VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -483,13 +804,10 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); Status s; @@ -505,23 +823,24 @@ TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { ASSERT_OK(Put(Key(1), "val")); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); // This Resume() will attempt to create a new manifest file and fail again s = dbfull()->Resume(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); // A successful Resume() will create a new manifest file s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -533,13 +852,14 @@ TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { } TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -553,7 +873,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { ASSERT_OK(Put(Key(0), "val")); ASSERT_OK(Put(Key(2), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( // Wait for flush of 2nd L0 file before starting compaction @@ -573,8 +893,8 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { if (fail_manifest.load()) { - fault_fs->SetFilesystemActive(false, - IOStatus::NoSpace("Out of space")); + 
fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -583,18 +903,18 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { // This Flush will trigger a compaction, which will fail when appending to // the manifest s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); TEST_SYNC_POINT("CompactionManifestWriteError:0"); // Clear all errors so when the compaction is retried, it will succeed - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("CompactionManifestWriteError:1"); TEST_SYNC_POINT("CompactionManifestWriteError:2"); s = dbfull()->TEST_WaitForCompact(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -606,13 +926,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { } TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -630,7 +947,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { ASSERT_OK(Put(Key(0), "val")); ASSERT_OK(Put(Key(2), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); listener->EnableAutoRecovery(false); @@ -649,14 +966,14 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { if (fail_manifest.load()) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); TEST_SYNC_POINT("CompactionManifestWriteError:0"); TEST_SYNC_POINT("CompactionManifestWriteError:1"); @@ -664,11 +981,11 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -681,13 +998,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; 
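The compaction and manifest tests above choreograph background work and fault injection through RocksDB's SyncPoint facility. A minimal sketch of that pattern, assuming a test (non-NDEBUG) build where SyncPoint is compiled in; the sync-point names, `fault_fs_`, and `error_msg` are the ones used by the tests above.

using ROCKSDB_NAMESPACE::SyncPoint;

// Block the compaction thread until the second flush has finished.
SyncPoint::GetInstance()->LoadDependency(
    {{"DBImpl::FlushMemTable:FlushMemTableFinished",
      "BackgroundCallCompaction:0"}});
// Fail the filesystem exactly when the MANIFEST write is attempted.
SyncPoint::GetInstance()->SetCallBack(
    "VersionSet::LogAndApply:WriteManifest",
    [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
SyncPoint::GetInstance()->EnableProcessing();
// ... issue Put()/Flush() so the flush and the dependent compaction run ...
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->DisableProcessing();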
options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -697,7 +1011,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->OverrideBGError( Status(Status::NoSpace(), Status::Severity::kHardError)); @@ -707,31 +1021,29 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) { std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -745,7 +1057,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); listener->EnableAutoRecovery(false); @@ -754,30 +1066,80 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) { "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); - s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = 
IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); - fault_fs->SetFilesystemActive(true); + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Destroy(options); } TEST_F(DBErrorHandlingFSTest, CorruptionError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; Status s; @@ -786,42 +1148,44 @@ TEST_F(DBErrorHandlingFSTest, CorruptionError) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::FlushMemTable:FlushMemTableFinished", "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void*) { - fault_fs->SetFilesystemActive(false, - IOStatus::Corruption("Corruption")); + fault_fs_->SetFilesystemActive(false, + IOStatus::Corruption("Corruption")); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_NE(s, Status::OK()); + ASSERT_NOK(s); Destroy(options); } TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(); @@ -829,17 +1193,29 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { ASSERT_OK(Put(Key(0), "val")); 
SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); s = Put(Key(1), "val"); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); Reopen(options); ASSERT_EQ("val", Get(Key(0))); @@ -848,13 +1224,10 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { } TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); Status s; @@ -864,7 +1237,7 @@ TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { ASSERT_OK(Put(Key(0), "val")); SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); @@ -872,17 +1245,18 @@ TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { // We should be able to shutdown the database while auto recovery is going // on in the background Close(); - DestroyDB(dbname_, options); + DestroyDB(dbname_, options).PermitUncheckedError(); } TEST_F(DBErrorHandlingFSTest, WALWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); @@ -901,7 +1275,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; { @@ -916,8 +1290,8 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, - IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of 
space")); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -927,7 +1301,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { ASSERT_EQ(s, s.NoSpace()); } SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); for (auto i = 0; i < 199; ++i) { if (i < 100) { @@ -948,19 +1322,15 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { } TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); options.paranoid_checks = true; options.max_bgerror_resume_count = 0; - Status s; Random rnd(301); DestroyAndReopen(options); @@ -978,7 +1348,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; // For the second batch, the first 2 file Append are successful, then the @@ -995,16 +1365,16 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); SyncPoint::GetInstance()->EnableProcessing(); WriteOptions wopts; wopts.sync = true; - s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(true, s.IsIOError()); + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOError()); } - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -1018,8 +1388,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { } // Resume and write a new batch, should be in the WAL - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(dbfull()->Resume()); { WriteBatch batch; @@ -1029,7 +1398,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; Reopen(options); @@ -1044,17 +1413,17 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); - Status s; Random rnd(301); listener->EnableAutoRecovery(); @@ -1071,7 +1440,7 @@ TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; { @@ -1087,18 +1456,18 @@ TEST_F(DBErrorHandlingFSTest, 
MultiCFWALWriteError) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, - IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); } }); SyncPoint::GetInstance()->EnableProcessing(); WriteOptions wopts; wopts.sync = true; - s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(s, s.NoSpace()); + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsNoSpace()); } SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); for (auto i = 1; i < 4; ++i) { @@ -1129,7 +1498,11 @@ TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { } TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { - FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); std::vector> fault_envs; std::vector fault_fs; std::vector options; @@ -1142,7 +1515,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { for (auto i = 0; i < kNumDbInstances; ++i) { listener.emplace_back(new ErrorHandlerFSListener()); options.emplace_back(GetDefaultOptions()); - fault_fs.emplace_back(new FaultInjectionTestFS(FileSystem::Default())); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); std::shared_ptr fs(fault_fs.back()); fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); options[i].env = fault_envs.back().get(); @@ -1159,9 +1532,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { listener[i]->InjectFileCreationError(fault_fs[i], 3, IOStatus::NoSpace("Out of space")); snprintf(buf, sizeof(buf), "_%d", i); - DestroyDB(dbname_ + std::string(buf), options[i]); - ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), - Status::OK()); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); db.emplace_back(dbptr); } @@ -1174,8 +1546,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); } def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); @@ -1189,8 +1561,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); } for (auto i = 0; i < kNumDbInstances; ++i) { @@ -1203,8 +1575,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { for (auto i = 0; i < kNumDbInstances; ++i) { std::string prop; ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); - ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), - Status::OK()); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); EXPECT_TRUE(db[i]->GetProperty( "rocksdb.num-files-at-level" + NumberToString(0), &prop)); EXPECT_EQ(atoi(prop.c_str()), 0); @@ -1213,6 +1584,10 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { EXPECT_EQ(atoi(prop.c_str()), 1); } + SstFileManagerImpl* sfmImpl = + 
static_cast_with_check(sfm.get()); + sfmImpl->Close(); + for (auto i = 0; i < kNumDbInstances; ++i) { char buf[16]; snprintf(buf, sizeof(buf), "_%d", i); @@ -1221,7 +1596,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { if (getenv("KEEP_DB")) { printf("DB is still at %s%s\n", dbname_.c_str(), buf); } else { - Status s = DestroyDB(dbname_ + std::string(buf), options[i]); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); } } options.clear(); @@ -1230,7 +1605,11 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { } TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { - FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); std::vector> fault_envs; std::vector fault_fs; std::vector options; @@ -1243,7 +1622,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { for (auto i = 0; i < kNumDbInstances; ++i) { listener.emplace_back(new ErrorHandlerFSListener()); options.emplace_back(GetDefaultOptions()); - fault_fs.emplace_back(new FaultInjectionTestFS(FileSystem::Default())); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); std::shared_ptr fs(fault_fs.back()); fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); options[i].env = fault_envs.back().get(); @@ -1272,9 +1651,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { break; } snprintf(buf, sizeof(buf), "_%d", i); - DestroyDB(dbname_ + std::string(buf), options[i]); - ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), - Status::OK()); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); db.emplace_back(dbptr); } @@ -1287,8 +1665,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); } def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); @@ -1302,11 +1680,11 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_OK(db[i]->Write(wopts, &batch)); if (i != 1) { - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + ASSERT_OK(db[i]->Flush(FlushOptions())); } else { - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace()); + ASSERT_TRUE(db[i]->Flush(FlushOptions()).IsNoSpace()); } } @@ -1320,7 +1698,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { ASSERT_EQ(s.severity(), Status::Severity::kHardError); break; case 2: - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); break; } fault_fs[i]->SetFilesystemActive(true); @@ -1333,8 +1711,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); } if (i == 1) { - ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), - Status::OK()); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); } EXPECT_TRUE(db[i]->GetProperty( "rocksdb.num-files-at-level" + NumberToString(0), &prop)); @@ -1344,6 +1721,10 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { EXPECT_EQ(atoi(prop.c_str()), 1); } + SstFileManagerImpl* sfmImpl = + static_cast_with_check(sfm.get()); + sfmImpl->Close(); + for (auto i = 0; i < 
kNumDbInstances; ++i) { char buf[16]; snprintf(buf, sizeof(buf), "_%d", i); @@ -1352,7 +1733,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { if (getenv("KEEP_DB")) { printf("DB is still at %s%s\n", dbname_.c_str(), buf); } else { - DestroyDB(dbname_ + std::string(buf), options[i]); + EXPECT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); } } options.clear(); @@ -1364,19 +1745,17 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { // to soft error and trigger auto resume. During auto resume, SwitchMemtable // is disabled to avoid small SST tables. Write can still be applied before // the bg error is cleaned unless the memtable is full. -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) { +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) { // Activate the FS before the first resume - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -1393,7 +1772,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) { "FLushWritNoWALRetryableeErrorAutoRecover1:1"}}); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); @@ -1403,7 +1782,23 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) { ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val1", Get(Key(1))); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + HistogramData autoresume_retry; + options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + &autoresume_retry); + ASSERT_GE(autoresume_retry.max, 0); ASSERT_OK(Put(Key(2), "val2", wo)); s = Flush(); // Since auto resume fails, the bg error is not cleand, flush will @@ -1412,29 +1807,25 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) { ASSERT_EQ("val2", Get(Key(2))); // call auto resume - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(dbfull()->Resume()); ASSERT_OK(Put(Key(3), "val3", wo)); - s = Flush(); // After resume is successful, the flush should be ok. 
- ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Flush()); ASSERT_EQ("val3", Get(Key(3))); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) { +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) { // Activate the FS before the first resume - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -1448,16 +1839,32 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) { ASSERT_OK(Put(Key(1), "val1", wo)); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + HistogramData autoresume_retry; + options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + &autoresume_retry); + ASSERT_GE(autoresume_retry.max, 0); ASSERT_OK(Put(Key(2), "val2", wo)); s = Flush(); // Since auto resume is successful, the bg error is cleaned, flush will @@ -1467,68 +1874,12 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover1) { - // Fail the first resume and make the second resume successful - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); - std::shared_ptr listener( - new ErrorHandlerFSListener()); - Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); - options.create_if_missing = true; - options.listeners.emplace_back(listener); - options.max_bgerror_resume_count = 2; - options.bgerror_resume_retry_interval = 100000; // 0.1 second - Status s; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - - IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); - error_msg.SetRetryable(true); - - ASSERT_OK(Put(Key(1), "val1")); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"RecoverFromRetryableBGIOError:BeforeWait0", - 
"FLushWritRetryableeErrorAutoRecover1:0"}, - {"FLushWritRetryableeErrorAutoRecover1:1", - "RecoverFromRetryableBGIOError:BeforeWait1"}, - {"RecoverFromRetryableBGIOError:RecoverSuccess", - "FLushWritRetryableeErrorAutoRecover1:2"}}); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:0"); - fault_fs->SetFilesystemActive(true); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:1"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:2"); - SyncPoint::GetInstance()->DisableProcessing(); - - ASSERT_EQ("val1", Get(Key(1))); - Reopen(options); - ASSERT_EQ("val1", Get(Key(1))); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - ASSERT_EQ("val2", Get(Key(2))); - - Destroy(options); -} - -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover2) { +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) { // Activate the FS before the first resume - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; @@ -1544,35 +1895,31 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover2) { ASSERT_OK(Put(Key(1), "val1")); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); ASSERT_EQ("val1", Get(Key(1))); Reopen(options); ASSERT_EQ("val1", Get(Key(1))); ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Flush()); ASSERT_EQ("val2", Get(Key(2))); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) { +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) { // Fail all the resume and let user to resume - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; @@ -1587,19 +1934,19 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) { ASSERT_OK(Put(Key(1), "val1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"FLushWritRetryableeErrorAutoRecover3:0", + {{"FLushWritRetryableeErrorAutoRecover2:0", 
"RecoverFromRetryableBGIOError:BeforeStart"}, {"RecoverFromRetryableBGIOError:LoopOut", - "FLushWritRetryableeErrorAutoRecover3:1"}}); + "FLushWritRetryableeErrorAutoRecover2:1"}}); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover3:0"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover3:1"); - fault_fs->SetFilesystemActive(true); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0"); + TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1"); + fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -1608,203 +1955,73 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) { // resume manually here. s = dbfull()->Resume(); ASSERT_EQ("val1", Get(Key(1))); - ASSERT_EQ(s, Status::OK()); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - ASSERT_EQ("val2", Get(Key(2))); - - Destroy(options); -} - -TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover4) { - // Fail the first resume and does not do resume second time because - // the IO error severity is Fatal Error and not Retryable. - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); - std::shared_ptr listener( - new ErrorHandlerFSListener()); - Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); - options.create_if_missing = true; - options.listeners.emplace_back(listener); - options.max_bgerror_resume_count = 2; - options.bgerror_resume_retry_interval = 10; // 0.1 second - Status s; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - - IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); - error_msg.SetRetryable(true); - IOStatus nr_msg = IOStatus::IOError("No Retryable Fatal IO Error"); - nr_msg.SetRetryable(false); - - ASSERT_OK(Put(Key(1), "val1")); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"RecoverFromRetryableBGIOError:BeforeStart", - "FLushWritRetryableeErrorAutoRecover4:0"}, - {"FLushWritRetryableeErrorAutoRecover4:2", - "RecoverFromRetryableBGIOError:RecoverFail0"}}); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); - SyncPoint::GetInstance()->SetCallBack( - "RecoverFromRetryableBGIOError:BeforeResume1", - [&](void*) { fault_fs->SetFilesystemActive(false, nr_msg); }); - - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover4:0"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover4:2"); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); - // Even the FS is recoverd, due to the Fatal Error in bg_error_ the resume - // and flush will all fail. 
- ASSERT_EQ("val1", Get(Key(1))); - s = dbfull()->Resume(); - ASSERT_NE(s, Status::OK()); - ASSERT_EQ("val1", Get(Key(1))); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_NE(s, Status::OK()); - ASSERT_EQ("NOT_FOUND", Get(Key(2))); - - Reopen(options); - ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(s); ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Flush()); ASSERT_EQ("val2", Get(Key(2))); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover5) { - // During the resume, call DB->CLose, make sure the resume thread exist - // before close continues. Due to the shutdown, the resume is not successful - // and the FS does not become active, so close status is still IO error - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { + // Fail the first resume and let the second resume be successful std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; - options.bgerror_resume_retry_interval = 10; // 0.1 second + options.bgerror_resume_retry_interval = 100000; // 0.1 second Status s; + std::string old_manifest; + std::string new_manifest; listener->EnableAutoRecovery(false); DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); error_msg.SetRetryable(true); - ASSERT_OK(Put(Key(1), "val1")); + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"RecoverFromRetryableBGIOError:BeforeStart", - "FLushWritRetryableeErrorAutoRecover5:0"}}); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover5:0"); - // The first resume will cause recovery_error and its severity is the - // Fatal error - s = dbfull()->Close(); - ASSERT_NE(s, Status::OK()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); - - Reopen(options); - ASSERT_NE("val1", Get(Key(1))); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - ASSERT_EQ("val2", Get(Key(2))); - - Destroy(options); -} - -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover6) { - // During the resume, call DB->CLose, make sure the resume thread exist - // before close continues. 
Due to the shutdown, the resume is not successful - // and the FS does not become active, so close status is still IO error - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); - std::shared_ptr listener( - new ErrorHandlerFSListener()); - Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); - options.create_if_missing = true; - options.listeners.emplace_back(listener); - options.max_bgerror_resume_count = 2; - options.bgerror_resume_retry_interval = 10; // 0.1 second - Status s; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - - IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); - error_msg.SetRetryable(true); - - ASSERT_OK(Put(Key(1), "val1")); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"FLushWritRetryableeErrorAutoRecover6:0", - "RecoverFromRetryableBGIOError:BeforeStart"}, - {"RecoverFromRetryableBGIOError:BeforeWait0", - "FLushWritRetryableeErrorAutoRecover6:1"}, - {"FLushWritRetryableeErrorAutoRecover6:2", + "ManifestWriteRetryableErrorAutoRecover:0"}, + {"ManifestWriteRetryableErrorAutoRecover:1", "RecoverFromRetryableBGIOError:BeforeWait1"}, - {"RecoverFromRetryableBGIOError:AfterWait0", - "FLushWritRetryableeErrorAutoRecover6:3"}}); + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "ManifestWriteRetryableErrorAutoRecover:2"}}); SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:0"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:1"); - fault_fs->SetFilesystemActive(true); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:2"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:3"); - // The first resume will cause recovery_error and its severity is the - // Fatal error - s = dbfull()->Close(); - ASSERT_EQ(s, Status::OK()); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2"); SyncPoint::GetInstance()->DisableProcessing(); - Reopen(options); - ASSERT_EQ("val1", Get(Key(1))); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - ASSERT_EQ("val2", Get(Key(2))); + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); - Destroy(options); + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); } -TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) { // Fail the first resume and let the second resume be successful - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = 
fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; @@ -1820,27 +2037,29 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); error_msg.SetRetryable(true); - ASSERT_OK(Put(Key(0), "val")); + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); ASSERT_OK(Flush()); - ASSERT_OK(Put(Key(1), "val")); + ASSERT_OK(Put(Key(1), "val", wo)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"RecoverFromRetryableBGIOError:BeforeStart", - "ManifestWriteRetryableErrorAutoRecover:0"}, - {"ManifestWriteRetryableErrorAutoRecover:1", + "ManifestWriteNoWALRetryableErrorAutoRecover:0"}, + {"ManifestWriteNoWALRetryableErrorAutoRecover:1", "RecoverFromRetryableBGIOError:BeforeWait1"}, {"RecoverFromRetryableBGIOError:RecoverSuccess", - "ManifestWriteRetryableErrorAutoRecover:2"}}); + "ManifestWriteNoWALRetryableErrorAutoRecover:2"}}); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0"); - fault_fs->SetFilesystemActive(true); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1"); - TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2"); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:2"); SyncPoint::GetInstance()->DisableProcessing(); new_manifest = GetManifestNameFromLiveFiles(); @@ -1854,13 +2073,10 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableErrorAutoRecover) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -1878,8 +2094,7 @@ TEST_F(DBErrorHandlingFSTest, ASSERT_OK(Put(Key(0), "val")); ASSERT_OK(Put(Key(2), "val")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Flush()); listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); listener->EnableAutoRecovery(false); @@ -1909,14 +2124,14 @@ TEST_F(DBErrorHandlingFSTest, ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { if (fail_manifest.load()) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:0"); 
TEST_SYNC_POINT("CompactionManifestWriteErrorAR:1"); @@ -1925,7 +2140,7 @@ TEST_F(DBErrorHandlingFSTest, ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:2"); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:3"); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:4"); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:5"); @@ -1948,13 +2163,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { // compaction, the FS is set to active and compaction is successful, so // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync // point. - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -1969,7 +2181,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); listener->EnableAutoRecovery(false); @@ -1980,13 +2192,13 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { "CompactionWriteRetryableErrorAutoRecover0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:Start", - [&](void*) { fault_fs->SetFilesystemActive(true); }); + [&](void*) { fault_fs_->SetFilesystemActive(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", [&](void*) { if (fail_first.load() && fail_second.load()) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); fail_second.store(false); } }); @@ -1994,11 +2206,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); - + ASSERT_OK(s); TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0"); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -2006,13 +2217,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { } TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); @@ -2037,7 +2245,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), 
Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; // For the second batch, the first 2 file Append are successful, then the @@ -2050,7 +2258,8 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"}, + {{"WALWriteErrorDone", "RecoverFromRetryableBGIOError:BeforeStart"}, + {"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"}, {"WALWriteError1:1", "RecoverFromRetryableBGIOError:BeforeResume1"}, {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError1:2"}}); @@ -2058,7 +2267,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2066,9 +2275,10 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { wopts.sync = true; s = dbfull()->Write(wopts, &batch); ASSERT_EQ(true, s.IsIOError()); + TEST_SYNC_POINT("WALWriteErrorDone"); TEST_SYNC_POINT("WALWriteError1:0"); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("WALWriteError1:1"); TEST_SYNC_POINT("WALWriteError1:2"); @@ -2094,7 +2304,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; Reopen(options); @@ -2110,13 +2320,10 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { // Fail the first recover and try second time. 
- std::shared_ptr<FaultInjectionTestFS> fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr<ErrorHandlerFSListener> listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); @@ -2141,7 +2348,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; // For the second batch, the first 2 file Append are successful, then the @@ -2162,7 +2369,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2172,7 +2379,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { ASSERT_EQ(true, s.IsIOError()); TEST_SYNC_POINT("WALWriteError2:0"); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("WALWriteError2:1"); TEST_SYNC_POINT("WALWriteError2:2"); @@ -2198,7 +2405,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; Reopen(options); @@ -2216,13 +2423,10 @@ class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest, public testing::WithParamInterface<bool> {}; TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { - std::shared_ptr<FaultInjectionTestFS> fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr<ErrorHandlerFSListener> listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.paranoid_checks = GetParam(); @@ -2233,27 +2437,24 @@ TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { ASSERT_OK(Put(Key(0), "val")); SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); ASSERT_TRUE(s.IsIOFenced()); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); ASSERT_TRUE(s.IsIOFenced()); Destroy(options); } TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { - std::shared_ptr<FaultInjectionTestFS> fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr<ErrorHandlerFSListener> listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.paranoid_checks = GetParam(); @@ -2266,11 +2467,11 @@ TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) {
old_manifest = GetManifestNameFromLiveFiles(); ASSERT_OK(Put(Key(0), "val")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put(Key(1), "val")); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); @@ -2278,20 +2479,17 @@ TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { ASSERT_TRUE(s.IsIOFenced()); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); ASSERT_TRUE(s.IsIOFenced()); Close(); } TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -2302,7 +2500,7 @@ TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->EnableAutoRecovery(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( @@ -2310,32 +2508,29 @@ TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); ASSERT_TRUE(s.IsIOFenced()); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); ASSERT_TRUE(s.IsIOFenced()); Destroy(options); } TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); @@ -2355,7 +2550,7 @@ TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; { @@ -2370,8 +2565,8 @@ TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, - IOStatus::IOFenced("IO fenced")); + fault_fs_->SetFilesystemActive(false, + IOStatus::IOFenced("IO fenced")); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2381,7 +2576,7 @@ TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) 
{ ASSERT_TRUE(s.IsIOFenced()); } SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); { WriteBatch batch; diff --git a/db/event_helpers.cc b/db/event_helpers.cc index b9fa35e330d..6164dde2962 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -108,6 +108,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( table_properties.num_entries) << "num_data_blocks" << table_properties.num_data_blocks << "num_entries" << table_properties.num_entries + << "num_filter_entries" << table_properties.num_filter_entries << "num_deletions" << table_properties.num_deletions << "num_merge_operands" << table_properties.num_merge_operands << "num_range_deletions" << table_properties.num_range_deletions @@ -125,8 +126,12 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << table_properties.compression_options << "creation_time" << table_properties.creation_time << "oldest_key_time" << table_properties.oldest_key_time << "file_creation_time" - << table_properties.file_creation_time << "db_id" - << table_properties.db_id << "db_session_id" + << table_properties.file_creation_time + << "slow_compression_estimated_data_size" + << table_properties.slow_compression_estimated_data_size + << "fast_compression_estimated_data_size" + << table_properties.fast_compression_estimated_data_size + << "db_id" << table_properties.db_id << "db_session_id" << table_properties.db_session_id; // user collected properties @@ -213,17 +218,16 @@ void EventHelpers::NotifyOnErrorRecoveryCompleted( const std::vector>& listeners, Status old_bg_error, InstrumentedMutex* db_mutex) { #ifndef ROCKSDB_LITE - if (listeners.size() == 0U) { - return; - } - db_mutex->AssertHeld(); - // release lock while notifying events - db_mutex->Unlock(); - for (auto& listener : listeners) { - listener->OnErrorRecoveryCompleted(old_bg_error); + if (listeners.size() > 0) { + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + listener->OnErrorRecoveryCompleted(old_bg_error); + } + db_mutex->Lock(); } old_bg_error.PermitUncheckedError(); - db_mutex->Lock(); #else (void)listeners; (void)old_bg_error; diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 624952563fe..035cb3698c3 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -24,13 +24,13 @@ class ExternalSSTFileBasicTest ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test", /*env_do_fsync=*/true) { sst_files_dir_ = dbname_ + "/sst_files/"; - fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default())); + fault_injection_test_env_.reset(new FaultInjectionTestEnv(env_)); DestroyAndRecreateExternalSSTFilesDir(); } void DestroyAndRecreateExternalSSTFilesDir() { - DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); } Status DeprecatedAddFile(const std::vector& files, @@ -162,7 +162,9 @@ class ExternalSSTFileBasicTest write_global_seqno, verify_checksums_before_ingest, true_data); } - ~ExternalSSTFileBasicTest() override { DestroyDir(env_, sst_files_dir_); } + ~ExternalSSTFileBasicTest() override { + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); + } protected: std::string sst_files_dir_; @@ -186,7 +188,7 @@ TEST_F(ExternalSSTFileBasicTest, Basic) { } ExternalSstFileInfo file1_info; Status s = 
sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); // Current file size should be non-zero after success write. ASSERT_GT(sst_file_writer.FileSize(), 0); @@ -202,14 +204,14 @@ TEST_F(ExternalSSTFileBasicTest, Basic) { ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName); // sst_file_writer already finished, cannot add this value s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); s = sst_file_writer.DeleteRange(Key(100), Key(200)); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); DestroyAndReopen(options); // Add file using file path s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -286,7 +288,7 @@ TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) { } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::string file_checksum, file_checksum_func_name; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( file1, &file_checksum, &file_checksum_func_name)); @@ -305,14 +307,14 @@ TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) { ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name); // sst_file_writer already finished, cannot add this value s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); s = sst_file_writer.DeleteRange(Key(100), Key(200)); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); DestroyAndReopen(options); // Add file using file path s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -338,7 +340,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(1000)); @@ -357,7 +359,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file2_info; s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(1100)); @@ -376,7 +378,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file3_info; s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 200); ASSERT_EQ(file3_info.smallest_key, Key(1300)); @@ -395,7 +397,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file4_info; s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 300); ASSERT_EQ(file4_info.smallest_key, Key(1500)); @@ -414,7 +416,7 @@ 
TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file5_info; s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 200); ASSERT_EQ(file5_info.smallest_key, Key(1800)); @@ -433,7 +435,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file6_info; s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 200); ASSERT_EQ(file6_info.smallest_key, Key(2000)); @@ -447,7 +449,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { s = AddFileWithFileChecksum({file1}, {file_checksum1, "xyz"}, {file_checksum1}, true, false, false, false); // does not care the checksum input since db does not enable file checksum - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file1)); std::vector live_files; dbfull()->GetLiveFilesMetaData(&live_files); @@ -465,26 +467,26 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { s = AddFileWithFileChecksum({file2}, {file_checksum2, "xyz"}, {file_checksum_func_name2}, true, false, false, false); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); // Enable verify_file_checksum option // The checksum name does not match, fail the ingestion s = AddFileWithFileChecksum({file2}, {file_checksum2}, {"xyz"}, true, false, false, false); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); // Enable verify_file_checksum option // The checksum itself does not match, fail the ingestion s = AddFileWithFileChecksum({file2}, {"xyz"}, {file_checksum_func_name2}, true, false, false, false); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); // Enable verify_file_checksum option // All matches, ingestion is successful s = AddFileWithFileChecksum({file2}, {file_checksum2}, {file_checksum_func_name2}, true, false, false, false); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files1; dbfull()->GetLiveFilesMetaData(&live_files1); for (auto f : live_files1) { @@ -501,7 +503,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { std::vector checksum, checksum_func; s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false, false, false); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files2; dbfull()->GetLiveFilesMetaData(&live_files2); for (auto f : live_files2) { @@ -511,20 +513,20 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { set1.insert(f.name); } } - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file3)); // Does not enable verify_file_checksum options // The checksum name does not match, fail the ingestion s = AddFileWithFileChecksum({file4}, {file_checksum4}, {"xyz"}, false, false, false, false); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); // Does not enable verify_file_checksum options // Checksum function name matches, store the checksum being ingested. 
s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4}, false, false, false, false); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files3; dbfull()->GetLiveFilesMetaData(&live_files3); for (auto f : live_files3) { @@ -535,7 +537,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { set1.insert(f.name); } } - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file4)); // enable verify_file_checksum options, DB enable checksum, and enable @@ -544,8 +546,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { s = AddFileWithFileChecksum({file5}, {file_checksum5}, {file_checksum_func_name5}, true, false, false, true); - ASSERT_OK(s); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files4; dbfull()->GetLiveFilesMetaData(&live_files4); for (auto f : live_files4) { @@ -558,7 +559,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { set1.insert(f.name); } } - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file5)); // Does not enable verify_file_checksum options and also the ingested file @@ -567,7 +568,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { std::vector files_c6, files_name6; s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false, false, false); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files6; dbfull()->GetLiveFilesMetaData(&live_files6); for (auto f : live_files6) { @@ -577,7 +578,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { set1.insert(f.name); } } - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file6)); } @@ -595,7 +596,7 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) { } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); @@ -609,7 +610,7 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) { } ExternalSstFileInfo file2_info; s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -623,23 +624,23 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) { } ExternalSstFileInfo file3_info; s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 15); ASSERT_EQ(file3_info.smallest_key, Key(110)); ASSERT_EQ(file3_info.largest_key, Key(124)); s = DeprecatedAddFile({file1}, true /* move file */); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); s = DeprecatedAddFile({file2}, false /* copy file */); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file2)); // This file has overlapping values with the existing data s = DeprecatedAddFile({file3}, true /* move file */); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file3)); for (int k = 0; k < 300; k++) { @@ -1109,6 +1110,7 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) { } 
Options sst_file_writer_options; + sst_file_writer_options.env = env_; std::unique_ptr<SstFileWriter> sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = @@ -1125,7 +1127,7 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) { if (i == 2) { ingest_opt.write_global_seqno = true; } - ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok()); + ASSERT_NOK(db_->IngestExternalFile({file_name}, ingest_opt)); db_->ReleaseSnapshot(snapshot); SyncPoint::GetInstance()->DisableProcessing(); @@ -1134,14 +1136,50 @@ } } +TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) { + Options options; + options.create_if_missing = true; + options.env = env_; + + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) { + Status* s = static_cast<Status*>(arg); + *s = Status::NotSupported(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + + Options sst_file_writer_options; + sst_file_writer_options.env = env_; + std::unique_ptr<SstFileWriter> sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "reopen_not_supported_test_" + ".sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + ingest_opt.move_files = true; + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); +} + TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { Options options; options.create_if_missing = true; - SpecialEnv senv(Env::Default()); + SpecialEnv senv(env_); options.env = &senv; DestroyAndReopen(options); Options sst_file_writer_options; + sst_file_writer_options.env = env_; std::unique_ptr<SstFileWriter> sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst"; @@ -1324,7 +1362,7 @@ TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); ExternalSstFileInfo file8_info; Status s = sst_file_writer.Finish(&file8_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file8_info.file_path, file8); ASSERT_EQ(file8_info.num_entries, 0); ASSERT_EQ(file8_info.smallest_key, ""); @@ -1339,7 +1377,7 @@ TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); ExternalSstFileInfo file9_info; s = sst_file_writer.Finish(&file9_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file9_info.file_path, file9); ASSERT_EQ(file9_info.num_entries, 0); ASSERT_EQ(file9_info.smallest_key, ""); @@ -1351,7 +1389,7 @@ TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { // Range deletion tombstones are exclusive on their end key, so these SSTs // should not be considered as overlapping.
s = DeprecatedAddFile({file8, file9}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); DestroyAndRecreateExternalSSTFilesDir(); } @@ -1539,6 +1577,44 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { ASSERT_EQ(2, NumTableFilesAtLevel(0)); } +TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { + // Repro https://github.com/facebook/rocksdb/issues/6245. + // Flush three files to L0. Ingest one more file to trigger L0->L1 compaction + // via trivial move. The bug happened when L1 files were incorrectly sorted + // resulting in an old value for "k" returned by `Get()`. + Options options = CurrentOptions(); + + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open a + // file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("k", "b")); + + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + IngestExternalFileOptions ifo; + s = db_->IngestExternalFile({file1}, ifo); + ASSERT_OK(s); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(Get("k"), "b"); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, testing::Values(std::make_tuple(true, true), std::make_tuple(true, false), diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 0fcaf776256..11c0155f672 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -40,16 +40,25 @@ Status ExternalSstFileIngestionJob::Prepare( if (!status.ok()) { return status; } - files_to_ingest_.push_back(file_to_ingest); - } - for (const IngestedFileInfo& f : files_to_ingest_) { - if (f.cf_id != + if (file_to_ingest.cf_id != TablePropertiesCollectorFactory::Context::kUnknownColumnFamily && - f.cf_id != cfd_->GetID()) { + file_to_ingest.cf_id != cfd_->GetID()) { return Status::InvalidArgument( "External file column family id don't match"); } + + if (file_to_ingest.num_entries == 0 && + file_to_ingest.num_range_deletions == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!file_to_ingest.smallest_internal_key.Valid() || + !file_to_ingest.largest_internal_key.Valid()) { + return Status::Corruption("Generated table have corrupted keys"); + } + + files_to_ingest_.emplace_back(std::move(file_to_ingest)); } const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); @@ -83,16 +92,6 @@ Status ExternalSstFileIngestionJob::Prepare( return Status::NotSupported("Files have overlapping ranges"); } - for (IngestedFileInfo& f : files_to_ingest_) { - if (f.num_entries == 0 && f.num_range_deletions == 0) { - return Status::InvalidArgument("File contain no entries"); - } - - if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { - return Status::Corruption("Generated table have corrupted keys"); - } - } - // Copy/Move external files into DB std::unordered_set ingestion_path_ids; for (IngestedFileInfo& f : files_to_ingest_) { @@ -111,18 +110,26 @@ Status ExternalSstFileIngestionJob::Prepare( // directory before ingest the file. 
For integrity of RocksDB we need // to sync the file. std::unique_ptr file_to_sync; - status = fs_->ReopenWritableFile(path_inside_db, env_options_, - &file_to_sync, nullptr); - if (status.ok()) { - TEST_SYNC_POINT( - "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); - status = SyncIngestedFile(file_to_sync.get()); - TEST_SYNC_POINT( - "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to sync ingested file %s: %s", - path_inside_db.c_str(), status.ToString().c_str()); + Status s = fs_->ReopenWritableFile(path_inside_db, env_options_, + &file_to_sync, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen", + &s); + // Some file systems (especially remote/distributed) don't support + // reopening a file for writing and don't require reopening and + // syncing the file. Ignore the NotSupported error in that case. + if (!s.IsNotSupported()) { + status = s; + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } } } } @@ -204,7 +211,8 @@ Status ExternalSstFileIngestionJob::Prepare( requested_checksum_func_name, &generated_checksum, &generated_checksum_func_name, ingestion_options_.verify_checksums_readahead_size, - db_options_.allow_mmap_reads, io_tracer_); + db_options_.allow_mmap_reads, io_tracer_, + db_options_.rate_limiter.get()); if (!io_s.ok()) { status = io_s; ROCKS_LOG_WARN(db_options_.info_log, @@ -295,12 +303,13 @@ Status ExternalSstFileIngestionJob::Prepare( // TODO: The following is duplicated with Cleanup(). if (!status.ok()) { + IOOptions io_opts; // We failed, remove all files that we copied into the db for (IngestedFileInfo& f : files_to_ingest_) { if (f.internal_file_path.empty()) { continue; } - Status s = env_->DeleteFile(f.internal_file_path); + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "AddFile() clean up for file %s failed : %s", @@ -319,8 +328,8 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), file_to_ingest.largest_internal_key.user_key()); } - Status status = - cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed); + Status status = cfd_->RangesOverlapWithMemtables( + ranges, super_version, db_options_.allow_data_in_errors, flush_needed); if (status.ok() && *flush_needed && !ingestion_options_.allow_blocking_flush) { status = Status::InvalidArgument("External file requires flush"); @@ -338,6 +347,12 @@ Status ExternalSstFileIngestionJob::Run() { // with the files we are ingesting bool need_flush = false; status = NeedsFlush(&need_flush, super_version); + if (!status.ok()) { + return status; + } + if (need_flush) { + return Status::TryAgain(); + } assert(status.ok() && need_flush == false); #endif @@ -363,9 +378,32 @@ Status ExternalSstFileIngestionJob::Run() { super_version, force_global_seqno, cfd_->ioptions()->compaction_style, last_seqno, &f, &assigned_seqno); } + + // Modify the smallest/largest internal key to include the sequence number + // that we just learned. Only overwrite sequence number zero. 
There could + // be a nonzero sequence number already to indicate a range tombstone's + // exclusive endpoint. + ParsedInternalKey smallest_parsed, largest_parsed; + if (status.ok()) { + status = ParseInternalKey(*f.smallest_internal_key.rep(), + &smallest_parsed, false /* log_err_key */); + } + if (status.ok()) { + status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed, + false /* log_err_key */); + } if (!status.ok()) { return status; } + if (smallest_parsed.sequence == 0) { + UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, + smallest_parsed.type); + } + if (largest_parsed.sequence == 0) { + UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, + largest_parsed.type); + } + status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno); @@ -388,7 +426,7 @@ Status ExternalSstFileIngestionJob::Run() { int64_t temp_current_time = 0; uint64_t current_time = kUnknownFileCreationTime; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; - if (env_->GetCurrentTime(&temp_current_time).ok()) { + if (clock_->GetCurrentTime(&temp_current_time).ok()) { current_time = oldest_ancester_time = static_cast(temp_current_time); } @@ -406,7 +444,7 @@ void ExternalSstFileIngestionJob::UpdateStats() { // Update internal stats for new ingested files uint64_t total_keys = 0; uint64_t total_l0_files = 0; - uint64_t total_time = env_->NowMicros() - job_start_time_; + uint64_t total_time = clock_->NowMicros() - job_start_time_; EventLoggerStream stream = event_logger_->Log(); stream << "event" @@ -462,6 +500,7 @@ void ExternalSstFileIngestionJob::UpdateStats() { } void ExternalSstFileIngestionJob::Cleanup(const Status& status) { + IOOptions io_opts; if (!status.ok()) { // We failed to add the files to the database // remove all the files we copied @@ -469,7 +508,7 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { if (f.internal_file_path.empty()) { continue; } - Status s = env_->DeleteFile(f.internal_file_path); + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "AddFile() clean up for file %s failed : %s", @@ -481,7 +520,7 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { } else if (status.ok() && ingestion_options_.move_files) { // The files were moved and added successfully, remove original file links for (IngestedFileInfo& f : files_to_ingest_) { - Status s = env_->DeleteFile(f.external_file_path); + Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN( db_options_.info_log, @@ -605,22 +644,28 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->largest_internal_key = InternalKey("", 0, ValueType::kTypeValue); bool bounds_set = false; + bool allow_data_in_errors = db_options_.allow_data_in_errors; iter->SeekToFirst(); if (iter->Valid()) { - if (ParseInternalKey(iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. 
", + pik_status.getState()); } if (key.sequence != 0) { - return Status::Corruption("external file have non zero sequence number"); + return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->smallest_internal_key.SetFrom(key); iter->SeekToLast(); - if (ParseInternalKey(iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); } if (key.sequence != 0) { - return Status::Corruption("external file have non zero sequence number"); + return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->largest_internal_key.SetFrom(key); @@ -633,8 +678,11 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( if (range_del_iter != nullptr) { for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); range_del_iter->Next()) { - if (ParseInternalKey(range_del_iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); } RangeTombstone tombstone(key, range_del_iter->value()); @@ -797,7 +845,8 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_, &rwfile, nullptr); if (status.ok()) { - FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_); + FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_, + file_to_ingest->internal_file_path); std::string seqno_val; PutFixed64(&seqno_val, seqno); status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val, @@ -844,7 +893,7 @@ IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name, &file_checksum, &file_checksum_func_name, ingestion_options_.verify_checksums_readahead_size, - db_options_.allow_mmap_reads, io_tracer_); + db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get()); if (!io_s.ok()) { return io_s; } diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index a2782f54a5e..c669089d92b 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -16,13 +16,14 @@ #include "logging/event_logger.h" #include "options/db_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "rocksdb/sst_file_writer.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { class Directories; +class SystemClock; struct IngestedFileInfo { // External file path @@ -73,13 +74,13 @@ struct IngestedFileInfo { class ExternalSstFileIngestionJob { public: ExternalSstFileIngestionJob( - Env* env, VersionSet* versions, ColumnFamilyData* cfd, + VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, const IngestExternalFileOptions& ingestion_options, Directories* directories, EventLogger* event_logger, const std::shared_ptr& io_tracer) - : env_(env), + : clock_(db_options.clock), fs_(db_options.fs, io_tracer), versions_(versions), cfd_(cfd), @@ -89,7 +90,7 @@ class ExternalSstFileIngestionJob { ingestion_options_(ingestion_options), 
directories_(directories), event_logger_(event_logger), - job_start_time_(env_->NowMicros()), + job_start_time_(clock_->NowMicros()), consumed_seqno_count_(0), io_tracer_(io_tracer) { assert(directories != nullptr); @@ -169,7 +170,7 @@ class ExternalSstFileIngestionJob { template Status SyncIngestedFile(TWritableFile* file); - Env* env_; + SystemClock* clock_; FileSystemPtr fs_; VersionSet* versions_; ColumnFamilyData* cfd_; diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 0ccaf51eddf..6f3f0b3f262 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -16,6 +16,7 @@ #include "rocksdb/sst_file_writer.h" #include "test_util/testutil.h" #include "util/random.h" +#include "util/thread_guard.h" #include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { @@ -47,8 +48,8 @@ class ExternSSTFileLinkFailFallbackTest : DBTestBase("/external_sst_file_test", /*env_do_fsync=*/true), test_env_(new ExternalSSTTestEnv(env_, true)) { sst_files_dir_ = dbname_ + "/sst_files/"; - DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + EXPECT_EQ(DestroyDir(env_, sst_files_dir_), Status::OK()); + EXPECT_EQ(env_->CreateDir(sst_files_dir_), Status::OK()); options_ = CurrentOptions(); options_.disable_auto_compactions = true; options_.env = test_env_; @@ -79,8 +80,8 @@ class ExternalSSTFileTest } void DestroyAndRecreateExternalSSTFilesDir() { - DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); } Status GenerateOneExternalFile( @@ -116,7 +117,7 @@ class ExternalSSTFileTest for (const auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { - sst_file_writer.Finish(); + sst_file_writer.Finish().PermitUncheckedError(); return s; } } @@ -171,7 +172,7 @@ class ExternalSSTFileTest for (auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { - sst_file_writer.Finish(); + sst_file_writer.Finish().PermitUncheckedError(); return s; } } @@ -213,11 +214,10 @@ class ExternalSSTFileTest size_t num_cfs = column_families.size(); assert(ifos.size() == num_cfs); assert(data.size() == num_cfs); - Status s; std::vector args(num_cfs); for (size_t i = 0; i != num_cfs; ++i) { std::string external_file_path; - s = GenerateOneExternalFile( + Status s = GenerateOneExternalFile( options, column_families[i], data[i], file_id, sort_data, &external_file_path, true_data.size() == num_cfs ? &true_data[i] : nullptr); @@ -230,8 +230,7 @@ class ExternalSSTFileTest args[i].external_files.push_back(external_file_path); args[i].options = ifos[i]; } - s = db_->IngestExternalFiles(args); - return s; + return db_->IngestExternalFiles(args); } Status GenerateAndAddExternalFile( @@ -282,7 +281,9 @@ class ExternalSSTFileTest return db_->IngestExternalFile(files, opts); } - ~ExternalSSTFileTest() override { DestroyDir(env_, sst_files_dir_); } + ~ExternalSSTFileTest() override { + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); + } protected: int last_file_id_ = 0; @@ -305,8 +306,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); // Current file size should be non-zero after success write. 
ASSERT_GT(sst_file_writer.FileSize(), 0); @@ -319,8 +319,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_EQ(file1_info.smallest_range_del_key, ""); ASSERT_EQ(file1_info.largest_range_del_key, ""); // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val")); // file2.sst (100 => 199) std::string file2 = sst_files_dir_ + "file2.sst"; @@ -329,11 +328,9 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } // Cannot add this key because it's not after last added key - s = sst_file_writer.Put(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val")); ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 100); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -347,9 +344,8 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); - ASSERT_TRUE(s.ok()) << s.ToString(); // Current file size should be non-zero after success finish. ASSERT_GT(sst_file_writer.FileSize(), 0); ASSERT_EQ(file3_info.file_path, file3); @@ -365,8 +361,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file4_info)); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 10); ASSERT_EQ(file4_info.smallest_key, Key(30)); @@ -379,8 +374,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file5_info)); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 100); ASSERT_EQ(file5_info.smallest_key, Key(400)); @@ -389,10 +383,9 @@ TEST_F(ExternalSSTFileTest, Basic) { // file6.sst (delete 400 => 500) std::string file6 = sst_files_dir_ + "file6.sst"; ASSERT_OK(sst_file_writer.Open(file6)); - sst_file_writer.DeleteRange(Key(400), Key(500)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); ExternalSstFileInfo file6_info; - s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file6_info)); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 0); ASSERT_EQ(file6_info.smallest_key, ""); @@ -404,17 +397,16 @@ TEST_F(ExternalSSTFileTest, Basic) { // file7.sst (delete 500 => 570, put 520 => 599 divisible by 2) std::string file7 = sst_files_dir_ + "file7.sst"; ASSERT_OK(sst_file_writer.Open(file7)); - sst_file_writer.DeleteRange(Key(500), Key(550)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(500), Key(550))); for (int k = 520; k < 560; k += 2) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } - sst_file_writer.DeleteRange(Key(525), Key(575)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(525), Key(575))); for (int k = 560; k < 600; k += 2) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } 
ExternalSstFileInfo file7_info; - s = sst_file_writer.Finish(&file7_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file7_info)); ASSERT_EQ(file7_info.file_path, file7); ASSERT_EQ(file7_info.num_entries, 40); ASSERT_EQ(file7_info.smallest_key, Key(520)); @@ -426,10 +418,9 @@ TEST_F(ExternalSSTFileTest, Basic) { // file8.sst (delete 600 => 700) std::string file8 = sst_files_dir_ + "file8.sst"; ASSERT_OK(sst_file_writer.Open(file8)); - sst_file_writer.DeleteRange(Key(600), Key(700)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(600), Key(700))); ExternalSstFileInfo file8_info; - s = sst_file_writer.Finish(&file8_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file8_info)); ASSERT_EQ(file8_info.file_path, file8); ASSERT_EQ(file8_info.num_entries, 0); ASSERT_EQ(file8_info.smallest_key, ""); @@ -441,13 +432,11 @@ TEST_F(ExternalSSTFileTest, Basic) { // Cannot create an empty sst file std::string file_empty = sst_files_dir_ + "file_empty.sst"; ExternalSstFileInfo file_empty_info; - s = sst_file_writer.Finish(&file_empty_info); - ASSERT_NOK(s); + ASSERT_NOK(sst_file_writer.Finish(&file_empty_info)); DestroyAndReopen(options); // Add file using file path - s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(DeprecatedAddFile({file1})); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -468,12 +457,10 @@ TEST_F(ExternalSSTFileTest, Basic) { } // This file has overlapping values with the existing data - s = DeprecatedAddFile({file3}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file3})); // This file has overlapping values with the existing data - s = DeprecatedAddFile({file4}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file4})); // Overwrite values of keys divisible by 5 for (int k = 0; k < 200; k += 5) { @@ -485,8 +472,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(DeprecatedAddFile({file5})); // This file has overlapping values with the existing data - s = DeprecatedAddFile({file6}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file6})); // Key range of file7 (500 => 598) don't overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file7})); @@ -614,15 +600,13 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); ASSERT_EQ(file1_info.largest_key, Key(99)); // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val")); // file2.sst (100 => 199) std::string file2 = sst_files_dir_ + "file2.sst"; @@ -631,11 +615,9 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } // Cannot add this key because it's not after last added key - s = sst_file_writer.Put(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val")); ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + 
ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 100); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -649,8 +631,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 5); ASSERT_EQ(file3_info.smallest_key, Key(195)); @@ -664,8 +645,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file4_info)); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 10); ASSERT_EQ(file4_info.smallest_key, Key(30)); @@ -678,8 +658,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file5_info)); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 100); ASSERT_EQ(file5_info.smallest_key, Key(200)); @@ -691,8 +670,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75))); ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100))); ExternalSstFileInfo file6_info; - s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file6_info)); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 0); ASSERT_EQ(file6_info.smallest_key, ""); @@ -706,8 +684,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Open(file7)); ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201))); ExternalSstFileInfo file7_info; - s = sst_file_writer.Finish(&file7_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file7_info)); ASSERT_EQ(file7_info.file_path, file7); ASSERT_EQ(file7_info.num_entries, 0); ASSERT_EQ(file7_info.smallest_key, ""); @@ -727,17 +704,13 @@ TEST_F(ExternalSSTFileTest, AddList) { DestroyAndReopen(options); // These lists of files have key ranges that overlap with each other - s = DeprecatedAddFile(file_list1); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list1)); // Both of the following overlap on the range deletion tombstone. 
- s = DeprecatedAddFile(file_list4); - ASSERT_FALSE(s.ok()) << s.ToString(); - s = DeprecatedAddFile(file_list5); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list4)); + ASSERT_NOK(DeprecatedAddFile(file_list5)); // Add files using file path list - s = DeprecatedAddFile(file_list0); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(DeprecatedAddFile(file_list0)); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 200; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -778,8 +751,7 @@ TEST_F(ExternalSSTFileTest, AddList) { } // This file list has overlapping values with the existing data - s = DeprecatedAddFile(file_list3); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list3)); // Overwrite values of keys divisible by 5 for (int k = 0; k < 200; k += 5) { @@ -847,16 +819,14 @@ TEST_F(ExternalSSTFileTest, AddListAtomicity) { for (int k = i * 100; k < (i + 1) * 100; k++) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } - Status s = sst_file_writer.Finish(&files_info[i]); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&files_info[i])); ASSERT_EQ(files_info[i].file_path, files[i]); ASSERT_EQ(files_info[i].num_entries, 100); ASSERT_EQ(files_info[i].smallest_key, Key(i * 100)); ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1)); } files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst"); - auto s = DeprecatedAddFile(files); - ASSERT_NOK(s) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(files)); for (int k = 0; k < n * 100; k++) { ASSERT_EQ("NOT_FOUND", Get(Key(k))); } @@ -878,17 +848,14 @@ TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) { // file1.sst (0 => 500) std::string sst_file_path = sst_files_dir_ + "file1.sst"; - Status s = sst_file_writer.Open(sst_file_path); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Open(sst_file_path)); for (int i = 0; i < 500; i++) { std::string k = Key(i); - s = sst_file_writer.Put(k, k + "_val"); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Put(k, k + "_val")); } ExternalSstFileInfo sst_file_info; - s = sst_file_writer.Finish(&sst_file_info); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Finish(&sst_file_info)); options.delete_obsolete_files_period_micros = 0; options.disable_auto_compactions = true; @@ -900,12 +867,11 @@ TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) { ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", "xxx")); ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - s = DeprecatedAddFile({sst_file_path}); - ASSERT_OK(s); + ASSERT_OK(DeprecatedAddFile({sst_file_path})); for (int i = 0; i < 500; i++) { std::string k = Key(i); @@ -928,8 +894,7 @@ TEST_F(ExternalSSTFileTest, SkipSnapshot) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); @@ -942,8 +907,7 @@ TEST_F(ExternalSSTFileTest, SkipSnapshot) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, 
file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -972,8 +936,7 @@ TEST_F(ExternalSSTFileTest, SkipSnapshot) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 100); ASSERT_EQ(file3_info.smallest_key, Key(300)); @@ -1019,8 +982,7 @@ TEST_F(ExternalSSTFileTest, MultiThreaded) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k))); } - Status s = sst_file_writer.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish()); }; // Write num_files files in parallel std::vector sst_writer_threads; @@ -1082,8 +1044,7 @@ TEST_F(ExternalSSTFileTest, MultiThreaded) { // Overwrite values of keys divisible by 100 for (int k = 0; k < num_files * keys_per_file; k += 100) { std::string key = Key(k); - Status s = Put(key, key + "_new"); - ASSERT_TRUE(s.ok()); + ASSERT_OK(Put(key, key + "_new")); } for (int i = 0; i < 2; i++) { @@ -1167,7 +1128,8 @@ TEST_F(ExternalSSTFileTest, OverlappingRanges) { // Generate the file containing the range std::string file_name = sst_files_dir_ + env_->GenerateUniqueId(); - ASSERT_OK(sst_file_writer.Open(file_name)); + s = sst_file_writer.Open(file_name); + ASSERT_OK(s); for (int k = range_start; k <= range_end; k++) { s = sst_file_writer.Put(Key(k), range_val); ASSERT_OK(s); @@ -1212,10 +1174,10 @@ TEST_F(ExternalSSTFileTest, OverlappingRanges) { // Flush / Compact the DB if (i && i % 50 == 0) { - Flush(); + ASSERT_OK(Flush()); } if (i && i % 75 == 0) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } } @@ -1293,7 +1255,7 @@ TEST_P(ExternalSSTFileTest, PickedLevel) { // Hold compaction from finishing TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); EXPECT_EQ(FilesPerLevel(), "1,1,1,2"); size_t kcnt = 0; @@ -1329,8 +1291,11 @@ TEST_F(ExternalSSTFileTest, PickedLevelBug) { // We have 2 overlapping files in L0 EXPECT_EQ(FilesPerLevel(), "2"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::PickedLevelBug:0"}, + {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", + "ExternalSSTFileTest::PickedLevelBug:0"}, {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"}, {"ExternalSSTFileTest::PickedLevelBug:2", "DBImpl::RunManualCompaction:0"}, @@ -1344,37 +1309,47 @@ TEST_F(ExternalSSTFileTest, PickedLevelBug) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - // While writing the MANIFEST start a thread that will ask for compaction - ROCKSDB_NAMESPACE::port::Thread bg_compact([&]() { - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - }); - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); + Status bg_compact_status; + Status bg_addfile_status; - // Start a thread that will ingest a new file - ROCKSDB_NAMESPACE::port::Thread bg_addfile([&]() { - file_keys = {1, 2, 3}; - ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, 1)); - }); + { + // While writing the MANIFEST start a thread that will ask for compaction + ThreadGuard bg_compact(port::Thread([&]() { + bg_compact_status = + 
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + })); + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); + + // Start a thread that will ingest a new file + ThreadGuard bg_addfile(port::Thread([&]() { + file_keys = {1, 2, 3}; + bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1); + })); + + // Wait for AddFile to start picking levels and writing MANIFEST + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); - // Wait for AddFile to start picking levels and writing MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); + // We need to verify that no compactions can run while AddFile is + // ingesting the files into the levels it find suitable. So we will + // wait for 2 seconds to give a chance for compactions to run during + // this period, and then make sure that no compactions where able to run + env_->SleepForMicroseconds(1000000 * 2); + bool bg_compact_started_tmp = bg_compact_started.load(); - // We need to verify that no compactions can run while AddFile is - // ingesting the files into the levels it find suitable. So we will - // wait for 2 seconds to give a chance for compactions to run during - // this period, and then make sure that no compactions where able to run - env_->SleepForMicroseconds(1000000 * 2); - ASSERT_FALSE(bg_compact_started.load()); + // Hold AddFile from finishing writing the MANIFEST + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); - // Hold AddFile from finishing writing the MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); + // check the status at the end, so even if the ASSERT fails the threads + // could be joined and return. + ASSERT_FALSE(bg_compact_started_tmp); + } - bg_addfile.join(); - bg_compact.join(); + ASSERT_OK(bg_addfile_status); + ASSERT_OK(bg_compact_status); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); int total_keys = 0; Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1411,7 +1386,7 @@ TEST_F(ExternalSSTFileTest, IngestNonExistingFile) { // After full compaction, there should be only 1 file. 
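// Aside, not part of the patch: a minimal sketch of the join-guard pattern
// adopted in PickedLevelBug above. gtest ASSERT_* macros only return from the
// enclosing function, so asserting inside a background thread cannot stop the
// test; the change instead stores each thread's Status in a local and asserts
// on it after the guard has joined the thread. JoinGuard and DoBackgroundWork
// below are hypothetical stand-ins for ROCKSDB_NAMESPACE::ThreadGuard and the
// real test work.
#include <thread>
#include <utility>
#include "rocksdb/status.h"

class JoinGuard {
 public:
  explicit JoinGuard(std::thread&& t) : t_(std::move(t)) {}
  ~JoinGuard() {
    if (t_.joinable()) {
      t_.join();  // runs even if an ASSERT made the test body return early
    }
  }

 private:
  std::thread t_;
};

// Usage sketch inside a test body:
//   rocksdb::Status bg_status;
//   {
//     JoinGuard bg(std::thread([&]() { bg_status = DoBackgroundWork(); }));
//     // ... sync points and checks that may ASSERT and return early ...
//   }  // the background thread is joined here no matter what
//   ASSERT_OK(bg_status);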
std::vector files; - env_->GetChildren(dbname_, &files); + ASSERT_OK(env_->GetChildren(dbname_, &files)); int num_sst_files = 0; for (auto& f : files) { uint64_t number; @@ -1533,7 +1508,7 @@ TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2"); // Output of the compaction will go to L3 - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); EXPECT_EQ(FilesPerLevel(), "1,0,0,2"); Close(); @@ -1675,7 +1650,7 @@ TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) { cro.exclusive_manual_compaction = false; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -1727,9 +1702,9 @@ TEST_F(ExternalSSTFileTest, WithUnorderedWrite) { Options options = CurrentOptions(); options.unordered_write = true; DestroyAndReopen(options); - Put("foo", "v1"); + ASSERT_OK(Put("foo", "v1")); SyncPoint::GetInstance()->EnableProcessing(); - port::Thread writer([&]() { Put("bar", "v2"); }); + port::Thread writer([&]() { ASSERT_OK(Put("bar", "v2")); }); TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"); ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1, @@ -1778,7 +1753,7 @@ TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { } size_t kcnt = 0; VerifyDBFromMap(true_data, &kcnt, false); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDBFromMap(true_data, &kcnt, false); } } @@ -1862,8 +1837,8 @@ TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { ASSERT_OK(Put(Key(k), "memtable")); true_data[Key(k)] = "memtable"; } - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); bool write_global_seqno = std::get<0>(GetParam()); @@ -1872,40 +1847,40 @@ TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { ASSERT_OK(GenerateAndAddExternalFile( options, {90, 100, 110}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable ASSERT_OK(GenerateAndAddExternalFile( options, {19, 20, 21}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_EQ(entries_in_memtable, 0); for (int k : {200, 201, 205, 206}) { ASSERT_OK(Put(Key(k), "memtable")); true_data[Key(k)] = "memtable"; } - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // No need for flush, this file keys fit between the memtable keys ASSERT_OK(GenerateAndAddExternalFile( options, {202, 203, 204}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - 
db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable ASSERT_OK(GenerateAndAddExternalFile( options, {206, 207}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_EQ(entries_in_memtable, 0); size_t kcnt = 0; @@ -2303,7 +2278,7 @@ TEST_P(ExternalSSTFileTest, IngestBehind) { ASSERT_OK(Put(Key(i), "memtable")); true_data[Key(i)] = "memtable"; } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Universal picker should go at second from the bottom level ASSERT_EQ("0,1", FilesPerLevel()); ASSERT_OK(GenerateAndAddExternalFile( @@ -2317,7 +2292,7 @@ TEST_P(ExternalSSTFileTest, IngestBehind) { verify_checksums_before_ingest, true /*ingest_behind*/, false /*sort_data*/, &true_data)); ASSERT_EQ("0,1,1", FilesPerLevel()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // bottom level should be empty ASSERT_EQ("0,1", FilesPerLevel()); @@ -2465,9 +2440,8 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_OK(s); + ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); Close(); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, options); @@ -2648,9 +2622,8 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" @@ -2718,9 +2691,8 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" @@ -2793,9 +2765,8 @@ TEST_P(ExternalSSTFileTest, std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" @@ -2806,7 +2777,7 @@ TEST_P(ExternalSSTFileTest, "PartialManifestWriteFail:1"); ingest_thread.join(); - 
fault_injection_env->DropUnsyncedFileData(); + ASSERT_OK(fault_injection_env->DropUnsyncedFileData()); fault_injection_env->SetFilesystemActive(true); Close(); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, @@ -2841,7 +2812,7 @@ TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) { // sure that it won't enter the 2nd writer queue for the second time. std::vector> data; data.push_back(std::make_pair("1001", "v2")); - GenerateAndAddExternalFile(options, data); + ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true)); } TEST_P(ExternalSSTFileTest, DeltaEncodingWhileGlobalSeqnoPresent) { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 7e208bbf918..1a3715e3213 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -16,7 +16,6 @@ #include "db/version_set.h" #include "env/mock_env.h" #include "file/filename.h" -#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -193,7 +192,7 @@ class FaultInjectionTest for (int i = start_idx; i < start_idx + num_vals; i++) { Slice key = Key(i, &key_space); batch.Clear(); - batch.Put(key, Value(i, &value_space)); + ASSERT_OK(batch.Put(key, Value(i, &value_space))); ASSERT_OK(db_->Write(write_options, &batch)); } } @@ -273,12 +272,12 @@ class FaultInjectionTest for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); } - + ASSERT_OK(iter->status()); delete iter; FlushOptions flush_options; flush_options.wait = true; - db_->Flush(flush_options); + ASSERT_OK(db_->Flush(flush_options)); } // rnd cannot be null for kResetDropRandomUnsyncedData @@ -311,7 +310,7 @@ class FaultInjectionTest Build(write_options, 0, num_pre_sync); if (sync_use_compact_) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } write_options.sync = false; Build(write_options, num_pre_sync, num_post_sync); @@ -343,7 +342,7 @@ class FaultInjectionTest } void WaitCompactionFinish() { - static_cast(db_->GetRootDB())->TEST_WaitForCompact(); + ASSERT_OK(static_cast(db_->GetRootDB())->TEST_WaitForCompact()); ASSERT_OK(db_->Put(WriteOptions(), "", "")); } }; @@ -410,7 +409,7 @@ TEST_P(FaultInjectionTest, WriteOptionSyncTest) { write_options.sync = true; ASSERT_OK( db_->Put(write_options, Key(2, &key_space), Value(2, &value_space))); - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(false)); env_->SetFilesystemActive(false); NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); @@ -451,7 +450,7 @@ TEST_P(FaultInjectionTest, UninstalledCompaction) { Build(WriteOptions(), 0, kNumKeys); FlushOptions flush_options; flush_options.wait = true; - db_->Flush(flush_options); + ASSERT_OK(db_->Flush(flush_options)); ASSERT_OK(db_->Put(WriteOptions(), "", "")); TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0"); TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1"); @@ -522,9 +521,9 @@ TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) { wo.sync = true; wo.disableWAL = false; WriteBatch batch; - batch.Put("cats", "dogs"); + ASSERT_OK(batch.Put("cats", "dogs")); batch.MarkWalTerminationPoint(); - batch.Put("boys", "girls"); + ASSERT_OK(batch.Put("boys", "girls")); ASSERT_OK(db_->Write(wo, &batch)); env_->SetFilesystemActive(false); diff --git a/db/filename_test.cc b/db/filename_test.cc index 1e53c952002..d166876ba00 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -10,7 +10,6 @@ #include "file/filename.h" 
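// Aside, not part of the patch: a minimal usage sketch of the
// WriteBatch::MarkWalTerminationPoint() behavior exercised by
// WriteBatchWalTerminationTest above. Updates appended after the termination
// point are applied to the memtable but are not written to the WAL, so they
// do not survive the simulated crash in that test.
#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"

rocksdb::Status WriteWithWalCutoff(rocksdb::DB* db) {
  rocksdb::WriteBatch batch;
  rocksdb::Status s = batch.Put("cats", "dogs");  // logged to the WAL
  if (!s.ok()) {
    return s;
  }
  batch.MarkWalTerminationPoint();
  s = batch.Put("boys", "girls");  // memtable only, skipped in the WAL
  if (!s.ok()) {
    return s;
  }
  rocksdb::WriteOptions wo;
  wo.sync = true;
  wo.disableWAL = false;
  return db->Write(wo, &batch);
}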
#include "db/dbformat.h" -#include "logging/logging.h" #include "port/port.h" #include "test_util/testharness.h" @@ -35,23 +34,23 @@ TEST_F(FileNameTest, Parse) { FileType type; char mode; } cases[] = { - {"100.log", 100, kLogFile, kAllMode}, - {"0.log", 0, kLogFile, kAllMode}, - {"0.sst", 0, kTableFile, kAllMode}, - {"CURRENT", 0, kCurrentFile, kAllMode}, - {"LOCK", 0, kDBLockFile, kAllMode}, - {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, - {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, - {"METADB-2", 2, kMetaDatabase, kAllMode}, - {"METADB-7", 7, kMetaDatabase, kAllMode}, - {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, - {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, - {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, - {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, - {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, - {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, - {"18446744073709551615.log", 18446744073709551615ull, kLogFile, - kAllMode}, }; + {"100.log", 100, kWalFile, kAllMode}, + {"0.log", 0, kWalFile, kAllMode}, + {"0.sst", 0, kTableFile, kAllMode}, + {"CURRENT", 0, kCurrentFile, kAllMode}, + {"LOCK", 0, kDBLockFile, kAllMode}, + {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, + {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, + {"METADB-2", 2, kMetaDatabase, kAllMode}, + {"METADB-7", 7, kMetaDatabase, kAllMode}, + {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, + {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, + {"18446744073709551615.log", 18446744073709551615ull, kWalFile, kAllMode}, + }; for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) { for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir"); @@ -142,7 +141,7 @@ TEST_F(FileNameTest, Construction) { ASSERT_EQ("foo/", std::string(fname.data(), 4)); ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(192U, number); - ASSERT_EQ(kLogFile, type); + ASSERT_EQ(kWalFile, type); fname = TableFileName({DbPath("bar", 0)}, 200, 0); std::string fname1 = diff --git a/db/flush_job.cc b/db/flush_job.cc index 6e2a60ff903..10d6ed10868 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -75,6 +75,8 @@ const char* GetFlushReasonString (FlushReason flush_reason) { return "Manual Flush"; case FlushReason::kErrorRecovery: return "Error Recovery"; + case FlushReason::kWalFull: + return "WAL Full"; default: return "Invalid"; } @@ -83,7 +85,7 @@ const char* GetFlushReasonString (FlushReason flush_reason) { FlushJob::FlushJob( const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, - const MutableCFOptions& mutable_cf_options, const uint64_t* max_memtable_id, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, const FileOptions& file_options, VersionSet* versions, InstrumentedMutex* db_mutex, std::atomic* shutting_down, std::vector existing_snapshots, @@ -94,7 +96,8 @@ FlushJob::FlushJob( Statistics* stats, EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const std::string& db_id, const std::string& db_session_id) + const std::string& 
db_id, const std::string& db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) : dbname_(dbname), db_id_(db_id), db_session_id_(db_session_id), @@ -123,7 +126,10 @@ FlushJob::FlushJob( base_(nullptr), pick_memtable_called(false), thread_pri_(thread_pri), - io_tracer_(io_tracer) { + io_tracer_(io_tracer), + clock_(db_options_.clock), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback) { // Update the thread status to indicate flush. ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); @@ -160,7 +166,6 @@ void FlushJob::RecordFlushIOStats() { ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); } - void FlushJob::PickMemTable() { db_mutex_->AssertHeld(); assert(!pick_memtable_called); @@ -305,8 +310,8 @@ Status FlushJob::WriteLevel0Table() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_FLUSH_WRITE_L0); db_mutex_->AssertHeld(); - const uint64_t start_micros = db_options_.env->NowMicros(); - const uint64_t start_cpu_micros = db_options_.env->NowCPUNanos() / 1000; + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000; Status s; std::vector blob_file_additions; @@ -367,7 +372,7 @@ Status FlushJob::WriteLevel0Table() { TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression", &output_compression_); int64_t _current_time = 0; - auto status = db_options_.env->GetCurrentTime(&_current_time); + auto status = clock_->GetCurrentTime(&_current_time); // Safe to proceed even if GetCurrentTime fails. So, log and proceed. if (!status.ok()) { ROCKS_LOG_WARN( @@ -397,25 +402,49 @@ Status FlushJob::WriteLevel0Table() { ? current_time : meta_.oldest_ancester_time; + uint64_t num_input_entries = 0; + uint64_t memtable_payload_bytes = 0; + uint64_t memtable_garbage_bytes = 0; IOStatus io_s; + const std::string* const full_history_ts_low = + (full_history_ts_low_.empty()) ? 
nullptr : &full_history_ts_low_; + TableBuilderOptions tboptions( + *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(), + cfd_->int_tbl_prop_collector_factories(), output_compression_, + mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kFlush, creation_time, oldest_key_time, + current_time, db_id_, db_session_id_, 0 /* target_file_size */, + meta_.fd.GetNumber()); s = BuildTable( - dbname_, versions_, db_options_.env, db_options_.fs.get(), - *cfd_->ioptions(), mutable_cf_options_, file_options_, + dbname_, versions_, db_options_, tboptions, file_options_, cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, - &blob_file_additions, cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(), - cfd_->GetName(), existing_snapshots_, + &blob_file_additions, existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, - output_compression_, mutable_cf_options_.sample_for_compression, - mutable_cf_options_.compression_opts, mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), - TableFileCreationReason::kFlush, &io_s, io_tracer_, event_logger_, - job_context_->job_id, Env::IO_HIGH, &table_properties_, 0 /* level */, - creation_time, oldest_key_time, write_hint, current_time, db_id_, - db_session_id_); + &io_s, io_tracer_, event_logger_, job_context_->job_id, Env::IO_HIGH, + &table_properties_, write_hint, full_history_ts_low, blob_callback_, + &num_input_entries, &memtable_payload_bytes, &memtable_garbage_bytes); if (!io_s.ok()) { io_status_ = io_s; } + if (num_input_entries != total_num_entries && s.ok()) { + std::string msg = "Expected " + ToString(total_num_entries) + + " entries in memtables, but read " + + ToString(num_input_entries); + ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s", + cfd_->GetName().c_str(), job_context_->job_id, + msg.c_str()); + if (db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } + if (tboptions.reason == TableFileCreationReason::kFlush) { + RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + memtable_payload_bytes); + RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + memtable_garbage_bytes); + } LogFlush(db_options_.info_log); } ROCKS_LOG_INFO(db_options_.info_log, @@ -438,7 +467,6 @@ Status FlushJob::WriteLevel0Table() { // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. 
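// Aside, not part of the patch: the count check added above compares the
// number of entries the memtables claim to hold with the number the flush
// actually read, and (when enabled) turns a mismatch into Status::Corruption
// rather than only a log warning. A hedged sketch of opting in, assuming the
// corresponding DBOptions field is available in this build:
#include "rocksdb/options.h"

rocksdb::Options MakeStrictFlushOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Fail the flush with Corruption if the memtable entry count does not match
  // what the flush actually read, instead of only logging a warning.
  options.flush_verify_memtable_count = true;
  return options;
}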
const bool has_output = meta_.fd.GetFileSize() > 0; - assert(has_output || blob_file_additions.empty()); if (s.ok() && has_output) { // if we have more than 1 background thread, then we cannot @@ -462,24 +490,26 @@ Status FlushJob::WriteLevel0Table() { // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = db_options_.env->NowMicros() - start_micros; - stats.cpu_micros = db_options_.env->NowCPUNanos() / 1000 - start_cpu_micros; + stats.micros = clock_->NowMicros() - start_micros; + stats.cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros; if (has_output) { stats.bytes_written = meta_.fd.GetFileSize(); + stats.num_output_files = 1; + } - const auto& blobs = edit_->GetBlobFileAdditions(); - for (const auto& blob : blobs) { - stats.bytes_written += blob.GetTotalBlobBytes(); - } - - stats.num_output_files = static_cast(blobs.size()) + 1; + const auto& blobs = edit_->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); } + stats.num_output_files_blob = static_cast(blobs.size()); + RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros); cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats); - cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - stats.bytes_written); + cfd_->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); RecordFlushIOStats(); return s; } diff --git a/db/flush_job.h b/db/flush_job.h index b724b2464d2..ff2ad85bcab 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -17,6 +17,7 @@ #include #include +#include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" @@ -60,10 +61,9 @@ class FlushJob { // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive FlushJob(const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, - const MutableCFOptions& mutable_cf_options, - const uint64_t* max_memtable_id, const FileOptions& file_options, - VersionSet* versions, InstrumentedMutex* db_mutex, - std::atomic* shutting_down, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, SnapshotChecker* snapshot_checker, JobContext* job_context, @@ -73,8 +73,9 @@ class FlushJob { EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const std::string& db_id = "", - const std::string& db_session_id = ""); + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* blob_callback = nullptr); ~FlushJob(); @@ -110,12 +111,11 @@ class FlushJob { ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; const MutableCFOptions& mutable_cf_options_; - // Pointer to a variable storing the largest memtable id to flush in this + // A variable storing the largest memtable id to flush in this // flush job. RocksDB uses this variable to select the memtables to flush in // this job. All memtables in this column family with an ID smaller than or - // equal to *max_memtable_id_ will be selected for flush. 
If null, then all - // memtables in the column family will be selected. - const uint64_t* max_memtable_id_; + // equal to max_memtable_id_ will be selected for flush. + uint64_t max_memtable_id_; const FileOptions file_options_; VersionSet* versions_; InstrumentedMutex* db_mutex_; @@ -164,6 +164,10 @@ class FlushJob { IOStatus io_status_; const std::shared_ptr io_tracer_; + SystemClock* clock_; + + const std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 618594b2d1e..2366da201e1 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -16,6 +16,7 @@ #include "db/version_set.h" #include "file/writable_file_writer.h" #include "rocksdb/cache.h" +#include "rocksdb/file_system.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "test_util/testharness.h" @@ -28,49 +29,35 @@ namespace ROCKSDB_NAMESPACE { // TODO(icanadi) Mock out everything else: // 1. VersionSet // 2. Memtable -class FlushJobTest : public testing::Test { - public: - FlushJobTest() +class FlushJobTestBase : public testing::Test { + protected: + FlushJobTestBase(std::string dbname, const Comparator* ucmp) : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("flush_job_test")), + fs_(env_->GetFileSystem()), + dbname_(std::move(dbname)), + ucmp_(ucmp), options_(), db_options_(options_), column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), - mock_table_factory_(new mock::MockTableFactory()) { - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); - db_options_.db_paths.emplace_back(dbname_, - std::numeric_limits::max()); - db_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - // TODO(icanadi) Remove this once we mock out VersionSet - NewDB(); - std::vector column_families; - cf_options_.table_factory = mock_table_factory_; - for (const auto& cf_name : column_family_names_) { - column_families.emplace_back(cf_name, cf_options_); - } + mock_table_factory_(new mock::MockTableFactory()) {} - db_options_.env = env_; - db_options_.fs = fs_; - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); - EXPECT_OK(versions_->Recover(column_families, false)); + virtual ~FlushJobTestBase() { + if (getenv("KEEP_DB")) { + fprintf(stdout, "db is still in %s\n", dbname_.c_str()); + } else { + // destroy versions_ to release all file handles + versions_.reset(); + EXPECT_OK(DestroyDir(env_, dbname_)); + } } void NewDB() { - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); - std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); - new_db.SetDBId(db_id); - } + new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -82,6 +69,7 @@ class FlushJobTest : public testing::Test { VersionEdit new_cf; new_cf.AddColumnFamily(column_family_names_[i]); new_cf.SetColumnFamily(cf_id++); + new_cf.SetComparatorName(ucmp_->Name()); new_cf.SetLogNumber(0); new_cf.SetNextFile(2); new_cf.SetLastSequence(last_seq++); @@ -89,17 +77,19 @@ class FlushJobTest : public testing::Test { } const std::string manifest = 
DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, EnvOptions())); + { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); + ASSERT_OK(s); for (const auto& e : new_cfs) { record.clear(); @@ -114,9 +104,38 @@ class FlushJobTest : public testing::Test { ASSERT_OK(s); } + void SetUp() override { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + + // TODO(icanadi) Remove this once we mock out VersionSet + NewDB(); + + db_options_.env = env_; + db_options_.fs = fs_; + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.statistics = CreateDBStatistics(); + + cf_options_.comparator = ucmp_; + + std::vector column_families; + cf_options_.table_factory = mock_table_factory_; + for (const auto& cf_name : column_family_names_) { + column_families.emplace_back(cf_name, cf_options_); + } + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families, false)); + } + Env* env_; std::shared_ptr fs_; std::string dbname_; + const Comparator* const ucmp_; EnvOptions env_options_; Options options_; ImmutableDBOptions db_options_; @@ -131,6 +150,13 @@ class FlushJobTest : public testing::Test { std::shared_ptr mock_table_factory_; }; +class FlushJobTest : public FlushJobTestBase { + public: + FlushJobTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_test"), + BytewiseComparator()) {} +}; + TEST_F(FlushJobTest, Empty) { JobContext job_context(0); auto cfd = versions_->GetColumnFamilySet()->GetDefault(); @@ -138,7 +164,7 @@ TEST_F(FlushJobTest, Empty) { SnapshotChecker* snapshot_checker = nullptr; // not relavant FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), nullptr /* memtable_id */, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, nullptr, &event_logger, false, @@ -167,7 +193,8 @@ TEST_F(FlushJobTest, NonEmpty) { for (int i = 1; i < 10000; ++i) { std::string key(ToString((i + 1000) % 10000)); std::string value("value" + key); - new_mem->Add(SequenceNumber(i), kTypeValue, key, value); + ASSERT_OK(new_mem->Add(SequenceNumber(i), kTypeValue, key, value, + nullptr /* kv_prot_info */)); if ((i + 1000) % 10000 < 9995) { InternalKey internal_key(key, SequenceNumber(i), kTypeValue); inserted_keys.push_back({internal_key.Encode().ToString(), value}); @@ -175,7 +202,8 @@ TEST_F(FlushJobTest, NonEmpty) { } { - new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a"); + ASSERT_OK(new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", + "9999a", nullptr /* kv_prot_info */)); InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); 
inserted_keys.push_back({internal_key.Encode().ToString(), "9999a"}); } @@ -202,7 +230,8 @@ TEST_F(FlushJobTest, NonEmpty) { } const SequenceNumber seq(i + 10001); - new_mem->Add(seq, kTypeBlobIndex, key, blob_index); + ASSERT_OK(new_mem->Add(seq, kTypeBlobIndex, key, blob_index, + nullptr /* kv_prot_info */)); InternalKey internal_key(key, seq, kTypeBlobIndex); inserted_keys.push_back({internal_key.Encode().ToString(), blob_index}); @@ -219,7 +248,7 @@ TEST_F(FlushJobTest, NonEmpty) { SnapshotChecker* snapshot_checker = nullptr; // not relavant FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), nullptr /* memtable_id */, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, @@ -263,8 +292,8 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { for (size_t j = 0; j < num_keys_per_table; ++j) { std::string key(ToString(j + i * num_keys_per_table)); std::string value("value" + key); - mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, key, - value); + ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, + key, value, nullptr /* kv_prot_info */)); } } @@ -281,7 +310,7 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), &flush_memtable_id, env_options_, + *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, true, @@ -336,7 +365,8 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { for (size_t j = 0; j != num_keys_per_memtable; ++j) { std::string key(ToString(j + i * num_keys_per_memtable)); std::string value("value" + key); - mem->Add(curr_seqno++, kTypeValue, key, value); + ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value, + nullptr /* kv_prot_info */)); } cfd->imm()->Add(mem, &to_delete); @@ -353,7 +383,7 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { std::vector snapshot_seqs; flush_jobs.emplace_back(new FlushJob( dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), - &memtable_ids[k], env_options_, versions_.get(), &mutex_, + memtable_ids[k], env_options_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, true, @@ -391,8 +421,9 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { Status s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free, - nullptr /* db_directory */, nullptr /* log_buffer */); + versions_.get(), nullptr /* prep_tracker */, &mutex_, file_meta_ptrs, + &job_context.memtables_to_free, nullptr /* db_directory */, + nullptr /* log_buffer */); ASSERT_OK(s); mutex_.Unlock(); @@ -446,7 +477,8 @@ TEST_F(FlushJobTest, Snapshots) { for (int j = 0; j < insertions; ++j) { std::string value(rnd.HumanReadableString(10)); 
auto seqno = ++current_seqno; - new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value); + ASSERT_OK(new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value, + nullptr /* kv_prot_info */)); // a key is visible only if: // 1. it's the last one written (j == insertions - 1) // 2. there's a snapshot pointing at it @@ -470,7 +502,7 @@ TEST_F(FlushJobTest, Snapshots) { SnapshotChecker* snapshot_checker = nullptr; // not relavant FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), nullptr /* memtable_id */, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, @@ -487,6 +519,136 @@ TEST_F(FlushJobTest, Snapshots) { job_context.Clean(); } +class FlushJobTimestampTest : public FlushJobTestBase { + public: + FlushJobTimestampTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"), + test::ComparatorWithU64Ts()) {} + + void AddKeyValueToMemtable(MemTable* memtable, std::string key, uint64_t ts, + SequenceNumber seq, ValueType value_type, + Slice value) { + std::string key_str(std::move(key)); + PutFixed64(&key_str, ts); + ASSERT_OK(memtable->Add(seq, value_type, key_str, value, + nullptr /* kv_prot_info */)); + } + + protected: + static constexpr uint64_t kStartTs = 10; + static constexpr SequenceNumber kStartSeq = 0; + SequenceNumber curr_seq_{kStartSeq}; + std::atomic curr_ts_{kStartTs}; +}; + +TEST_F(FlushJobTimestampTest, AllKeysExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeDeletionWithTimestamp, ""); + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector snapshots; + constexpr SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string key = test::EncodeInt(0); + key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp); + ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode()); + 
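// Aside, not part of the patch: the key layout used by AddKeyValueToMemtable
// in FlushJobTimestampTest above -- the user key is followed by a fixed
// 64-bit timestamp appended with PutFixed64, and test::ComparatorWithU64Ts
// treats that suffix as the timestamp. A small sketch of building and
// splitting such keys (PutFixed64/DecodeFixed64 come from util/coding.h):
#include <cstdint>
#include <string>
#include "util/coding.h"

std::string MakeTsKey(const std::string& user_key, uint64_t ts) {
  std::string key_with_ts(user_key);
  ROCKSDB_NAMESPACE::PutFixed64(&key_with_ts, ts);  // 8-byte timestamp suffix
  return key_with_ts;
}

uint64_t ExtractTs(const std::string& key_with_ts) {
  // Assumes the key was built as above, i.e. it ends in an 8-byte timestamp.
  return ROCKSDB_NAMESPACE::DecodeFixed64(
      key_with_ts.data() + key_with_ts.size() - sizeof(uint64_t));
}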
ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode()); + } + + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + +TEST_F(FlushJobTimestampTest, NoKeyExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector snapshots; + SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 0); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string ukey = test::EncodeInt(0); + std::string smallest_key = + ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); + std::string largest_key = ukey + test::EncodeInt(kStartTs); + InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue); + InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); + ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode()); + ASSERT_EQ(largest.Encode(), fmeta.largest.Encode()); + } + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h index cbe17994fd4..eb03f3e1142 100644 --- a/db/flush_scheduler.h +++ b/db/flush_scheduler.h @@ -5,10 +5,11 @@ #pragma once -#include #include +#include #include #include + #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 013af04e997..80dd1bb9e86 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -46,7 +46,9 @@ class ForwardLevelIterator : public InternalIterator { file_iter_(nullptr), pinned_iters_mgr_(nullptr), prefix_extractor_(prefix_extractor), - allow_unprepared_value_(allow_unprepared_value) {} + allow_unprepared_value_(allow_unprepared_value) { + status_.PermitUncheckedError(); // Allow uninitialized status through + } ~ForwardLevelIterator() override { // Reset current pointer @@ -238,6 +240,12 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, if (sv_) { RebuildIterators(false); } + + // immutable_status_ is a local aggregation of the + // status of the immutable Iterators. + // We have to PermitUncheckedError in case it is never + // used, otherwise it will fail ASSERT_STATUS_CHECKED. 
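// Aside, not part of the patch: a minimal sketch of the
// Status::PermitUncheckedError() idiom the comment above refers to. Under
// ASSERT_STATUS_CHECKED builds, destroying a Status that was never inspected
// aborts the process; calling PermitUncheckedError() marks the status as
// intentionally ignored so such builds stay quiet.
#include "rocksdb/status.h"

void IgnoreBestEffortResult() {
  rocksdb::Status s = rocksdb::Status::NotFound("nothing to do");
  // This result is deliberately unused; tell the checked-status machinery so.
  s.PermitUncheckedError();
}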
+ immutable_status_.PermitUncheckedError(); } ForwardIterator::~ForwardIterator() { @@ -418,7 +426,7 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (seek_to_first) { l0_iters_[i]->SeekToFirst(); } else { - // If the target key passes over the larget key, we are sure Next() + // If the target key passes over the largest key, we are sure Next() // won't go over this file. if (user_comparator_->Compare(target_user_key, l0[i]->largest.user_key()) > 0) { @@ -985,9 +993,9 @@ bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters, uint32_t ForwardIterator::FindFileInRange( const std::vector& files, const Slice& internal_key, uint32_t left, uint32_t right) { - auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool { + auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool { return cfd_->internal_comparator().InternalKeyComparator::Compare( - f->largest.Encode(), key) < 0; + f->largest.Encode(), k) < 0; }; const auto &b = files.begin(); return static_cast(std::lower_bound(b + left, diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 9e9073483c1..7c8c44e4ed8 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -140,7 +140,7 @@ Status ImportColumnFamilyJob::Run() { int64_t temp_current_time = 0; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; uint64_t current_time = kUnknownOldestAncesterTime; - if (env_->GetCurrentTime(&temp_current_time).ok()) { + if (clock_->GetCurrentTime(&temp_current_time).ok()) { current_time = oldest_ancester_time = static_cast(temp_current_time); } @@ -252,15 +252,21 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( // Get first (smallest) key from file iter->SeekToFirst(); - if (ParseInternalKey(iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted Key in external file. ", + pik_status.getState()); } file_to_import->smallest_internal_key.SetFrom(key); // Get last (largest) key from file iter->SeekToLast(); - if (ParseInternalKey(iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + pik_status = + ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted Key in external file. ", + pik_status.getState()); } file_to_import->largest_internal_key.SetFrom(key); diff --git a/db/import_column_family_job.h b/db/import_column_family_job.h index 6cdde2473e8..3cf4eb56e89 100644 --- a/db/import_column_family_job.h +++ b/db/import_column_family_job.h @@ -9,24 +9,25 @@ #include "db/snapshot_impl.h" #include "options/db_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" #include "rocksdb/metadata.h" #include "rocksdb/sst_file_writer.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +struct EnvOptions; +class SystemClock; // Imports a set of sst files as is into a new column family. Logic is similar // to ExternalSstFileIngestionJob. 
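// Aside, not part of the patch: the error-handling shape used above now that
// ParseInternalKey returns a Status. Call sites keep the parse status and
// fold its message into the Corruption they return, with allow_data_in_errors
// controlling how much key data may appear in that message.
#include "db/dbformat.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"

rocksdb::Status CheckExternalFileKey(const rocksdb::Slice& internal_key,
                                     bool allow_data_in_errors) {
  ROCKSDB_NAMESPACE::ParsedInternalKey parsed;
  rocksdb::Status pik_status = ROCKSDB_NAMESPACE::ParseInternalKey(
      internal_key, &parsed, allow_data_in_errors);
  if (!pik_status.ok()) {
    return rocksdb::Status::Corruption("Corrupted Key in external file. ",
                                       pik_status.getState());
  }
  return rocksdb::Status::OK();
}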
class ImportColumnFamilyJob { public: - ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd, + ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, const ImportColumnFamilyOptions& import_options, const std::vector& metadata, const std::shared_ptr& io_tracer) - : env_(env), + : clock_(db_options.clock), versions_(versions), cfd_(cfd), db_options_(db_options), @@ -59,7 +60,7 @@ class ImportColumnFamilyJob { IngestedFileInfo* file_to_import, SuperVersion* sv); - Env* env_; + SystemClock* clock_; VersionSet* versions_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index fd77e04c5dd..3e76db76ca2 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -16,8 +16,8 @@ class ImportColumnFamilyTest : public DBTestBase { ImportColumnFamilyTest() : DBTestBase("/import_column_family_test", /*env_do_fsync=*/true) { sst_files_dir_ = dbname_ + "/sst_files/"; - DestroyAndRecreateExternalSSTFilesDir(); export_files_dir_ = test::PerThreadDBPath(env_, "export"); + DestroyAndRecreateExternalSSTFilesDir(); import_cfh_ = nullptr; import_cfh2_ = nullptr; metadata_ptr_ = nullptr; @@ -104,9 +104,9 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { ASSERT_NE(import_cfh_, nullptr); std::string value; - db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value)); ASSERT_EQ(value, "V1"); - db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value)); ASSERT_EQ(value, "V2"); ASSERT_OK(db_->DropColumnFamily(import_cfh_)); ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); @@ -125,9 +125,9 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { ASSERT_NE(import_cfh_, nullptr); std::string value; - db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K3", &value)); ASSERT_EQ(value, "V1"); - db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value)); ASSERT_EQ(value, "V2"); } } @@ -214,7 +214,7 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 16 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite4"); } else if (i % 4 == 0) { @@ -235,7 +235,7 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 5 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite5"); } else if (i % 16 == 0) { @@ -254,7 +254,7 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 5 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite5"); } else if (i % 16 == 0) { @@ -318,12 +318,12 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { std::string value1, value2; for (int i = 0; 
i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Get(1, Key(i)), value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } @@ -340,16 +340,16 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); } for (int i = 25; i < 50; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite3", value1); } for (int i = 50; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite2", value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } @@ -363,16 +363,16 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); } for (int i = 25; i < 50; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite3", value1); } for (int i = 50; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite2", value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } } @@ -424,7 +424,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { for (int i = 0; i < 100; ++i) { std::string value; - db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_OK(db_copy->Get(ReadOptions(), cfh, Key(i), &value)); ASSERT_EQ(Get(1, Key(i)), value); } ASSERT_OK(db_copy->DropColumnFamily(cfh)); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index afe3f71419a..5f1f06fa5a6 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -12,15 +12,20 @@ #include #include +#include #include #include #include #include #include +#include "cache/cache_entry_roles.h" +#include "cache/cache_entry_stats.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" +#include "table/block_based/cachable_entry.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -50,6 +55,8 @@ const std::map InternalStats::compaction_level_stats = {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}}, {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}}, {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}}, + {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}}, + {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}}, }; namespace { @@ -61,12 +68,14 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name, const std::string& group_by) { int written_size = snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str()); + written_size = std::min(written_size, static_cast(len)); auto hdr = [](LevelStatType t) { return InternalStats::compaction_level_stats.at(t).header_name.c_str(); }; int 
line_size = snprintf( buf + written_size, len - written_size, - "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s " + "%s\n", // Note that we skip COMPACTED_FILES and merge it with Files column group_by.c_str(), hdr(LevelStatType::NUM_FILES), hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE), @@ -77,9 +86,11 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name, hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC), hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT), hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN), - hdr(LevelStatType::KEY_DROP)); + hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB), + hdr(LevelStatType::W_BLOB_GB)); written_size += line_size; + written_size = std::min(written_size, static_cast(len)); snprintf(buf + written_size, len - written_size, "%s\n", std::string(line_size, '-').c_str()); } @@ -88,10 +99,12 @@ void PrepareLevelStats(std::map* level_stats, int num_files, int being_compacted, double total_file_size, double score, double w_amp, const InternalStats::CompactionStats& stats) { - uint64_t bytes_read = - stats.bytes_read_non_output_levels + stats.bytes_read_output_level; - int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; - double elapsed = (stats.micros + 1) / kMicrosInSec; + const uint64_t bytes_read = stats.bytes_read_non_output_levels + + stats.bytes_read_output_level + + stats.bytes_read_blob; + const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob; + const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; + const double elapsed = (stats.micros + 1) / kMicrosInSec; (*level_stats)[LevelStatType::NUM_FILES] = num_files; (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted; @@ -106,8 +119,7 @@ void PrepareLevelStats(std::map* level_stats, (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB; (*level_stats)[LevelStatType::WRITE_AMP] = w_amp; (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed; - (*level_stats)[LevelStatType::WRITE_MBPS] = - stats.bytes_written / kMB / elapsed; + (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed; (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec; (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec; (*level_stats)[LevelStatType::COMP_COUNT] = stats.count; @@ -117,6 +129,8 @@ void PrepareLevelStats(std::map* level_stats, static_cast(stats.num_input_records); (*level_stats)[LevelStatType::KEY_DROP] = static_cast(stats.num_dropped_records); + (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB; + (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB; } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -141,7 +155,9 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, "%9d " /* Comp(cnt) */ "%8.3f " /* Avg(sec) */ "%7s " /* KeyIn */ - "%6s\n", /* KeyDrop */ + "%6s " /* KeyDrop */ + "%9.1f " /* Rblob(GB) */ + "%9.1f\n", /* Wblob(GB) */ name.c_str(), static_cast(stat_value.at(LevelStatType::NUM_FILES)), static_cast(stat_value.at(LevelStatType::COMPACTED_FILES)), BytesToHumanString( @@ -166,7 +182,9 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, .c_str(), NumberToHumanString( static_cast(stat_value.at(LevelStatType::KEY_DROP))) - .c_str()); + .c_str(), + stat_value.at(LevelStatType::R_BLOB_GB), + 
stat_value.at(LevelStatType::W_BLOB_GB)); } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -207,6 +225,7 @@ static const std::string cfstats_no_file_histogram = static const std::string cf_file_histogram = "cf-file-histogram"; static const std::string dbstats = "dbstats"; static const std::string levelstats = "levelstats"; +static const std::string block_cache_entry_stats = "block-cache-entry-stats"; static const std::string num_immutable_mem_table = "num-immutable-mem-table"; static const std::string num_immutable_mem_table_flushed = "num-immutable-mem-table-flushed"; @@ -273,6 +292,8 @@ const std::string DB::Properties::kCFFileHistogram = rocksdb_prefix + cf_file_histogram; const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; +const std::string DB::Properties::kBlockCacheEntryStats = + rocksdb_prefix + block_cache_entry_stats; const std::string DB::Properties::kNumImmutableMemTable = rocksdb_prefix + num_immutable_mem_table; const std::string DB::Properties::kNumImmutableMemTableFlushed = @@ -372,14 +393,18 @@ const std::unordered_map nullptr}}, {DB::Properties::kDBStats, {false, &InternalStats::HandleDBStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kBlockCacheEntryStats, + {false, &InternalStats::HandleBlockCacheEntryStats, nullptr, + &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}}, {DB::Properties::kSSTables, {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}}, {DB::Properties::kAggregatedTableProperties, {false, &InternalStats::HandleAggregatedTableProperties, nullptr, - nullptr, nullptr}}, + &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}}, {DB::Properties::kAggregatedTablePropertiesAtLevel, {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, - nullptr, nullptr, nullptr}}, + nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap, + nullptr}}, {DB::Properties::kNumImmutableMemTable, {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr, nullptr}}, @@ -489,6 +514,159 @@ const std::unordered_map &DBImpl::GetPropertyHandleOptionsStatistics}}, }; +InternalStats::InternalStats(int num_levels, SystemClock* clock, + ColumnFamilyData* cfd) + : db_stats_{}, + cf_stats_value_{}, + cf_stats_count_{}, + comp_stats_(num_levels), + comp_stats_by_pri_(Env::Priority::TOTAL), + file_read_latency_(num_levels), + bg_error_count_(0), + number_levels_(num_levels), + clock_(clock), + cfd_(cfd), + started_at_(clock->NowMicros()) {} + +Status InternalStats::CollectCacheEntryStats(bool foreground) { + // Lazy initialize/reference the collector. It is pinned in cache (through + // a shared_ptr) so that it does not get immediately ejected from a full + // cache, which would force a re-scan on the next GetStats. + if (!cache_entry_stats_collector_) { + Cache* block_cache; + bool ok = HandleBlockCacheStat(&block_cache); + if (ok) { + // Extract or create stats collector. + Status s = CacheEntryStatsCollector::GetShared( + block_cache, clock_, &cache_entry_stats_collector_); + if (!s.ok()) { + // Block cache likely under pressure. Scanning could make it worse, + // so skip. + return s; + } + } else { + return Status::NotFound("block cache not configured"); + } + } + assert(cache_entry_stats_collector_); + + // For "background" collections, strictly cap the collection time by + // expanding effective cache TTL. For foreground, be more aggressive about + // getting latest data. 
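// Aside, not part of the patch: a usage sketch for the block-cache-entry-stats
// property registered above. Both the human-readable string form and the
// string-map form are wired up; `db` is assumed to be an open rocksdb::DB*.
#include <cstdio>
#include <map>
#include <string>
#include "rocksdb/db.h"

void DumpBlockCacheEntryStats(rocksdb::DB* db) {
  std::string text;
  if (db->GetProperty(rocksdb::DB::Properties::kBlockCacheEntryStats, &text)) {
    // e.g. "Block cache <id> capacity: ... entry stats(count,size,portion): ..."
    std::fprintf(stdout, "%s\n", text.c_str());
  }
  std::map<std::string, std::string> values;
  if (db->GetMapProperty(rocksdb::DB::Properties::kBlockCacheEntryStats,
                         &values)) {
    for (const auto& kv : values) {
      std::fprintf(stdout, "%s = %s\n", kv.first.c_str(), kv.second.c_str());
    }
  }
}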
+ int min_interval_seconds = foreground ? 10 : 180; + // 1/500 = max of 0.2% of one CPU thread + int min_interval_factor = foreground ? 10 : 500; + cache_entry_stats_collector_->GetStats( + &cache_entry_stats_, min_interval_seconds, min_interval_factor); + return Status::OK(); +} + +std::function +InternalStats::CacheEntryRoleStats::GetEntryCallback() { + return [&](const Slice& /*key*/, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + auto e = role_map_.find(deleter); + size_t role_idx; + if (e == role_map_.end()) { + role_idx = static_cast(CacheEntryRole::kMisc); + } else { + role_idx = static_cast(e->second); + } + entry_counts[role_idx]++; + total_charges[role_idx] += charge; + }; +} + +void InternalStats::CacheEntryRoleStats::BeginCollection( + Cache* cache, SystemClock*, uint64_t start_time_micros) { + Clear(); + last_start_time_micros_ = start_time_micros; + ++collection_count; + role_map_ = CopyCacheDeleterRoleMap(); + std::ostringstream str; + str << cache->Name() << "@" << static_cast(cache); + cache_id = str.str(); + cache_capacity = cache->GetCapacity(); +} + +void InternalStats::CacheEntryRoleStats::EndCollection( + Cache*, SystemClock*, uint64_t end_time_micros) { + last_end_time_micros_ = end_time_micros; +} + +void InternalStats::CacheEntryRoleStats::SkippedCollection() { + ++copies_of_last_collection; +} + +uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const { + if (last_end_time_micros_ > last_start_time_micros_) { + return last_end_time_micros_ - last_start_time_micros_; + } else { + return 0U; + } +} + +std::string InternalStats::CacheEntryRoleStats::ToString( + SystemClock* clock) const { + std::ostringstream str; + str << "Block cache " << cache_id + << " capacity: " << BytesToHumanString(cache_capacity) + << " collections: " << collection_count + << " last_copies: " << copies_of_last_collection + << " last_secs: " << (GetLastDurationMicros() / 1000000.0) + << " secs_since: " + << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n"; + str << "Block cache entry stats(count,size,portion):"; + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + if (entry_counts[i] > 0) { + str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i] + << "," << BytesToHumanString(total_charges[i]) << "," + << (100.0 * total_charges[i] / cache_capacity) << "%)"; + } + } + str << "\n"; + return str.str(); +} + +void InternalStats::CacheEntryRoleStats::ToMap( + std::map* values, SystemClock* clock) const { + values->clear(); + auto& v = *values; + v["id"] = cache_id; + v["capacity"] = ROCKSDB_NAMESPACE::ToString(cache_capacity); + v["secs_for_last_collection"] = + ROCKSDB_NAMESPACE::ToString(GetLastDurationMicros() / 1000000.0); + v["secs_since_last_collection"] = ROCKSDB_NAMESPACE::ToString( + (clock->NowMicros() - last_end_time_micros_) / 1000000U); + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + std::string role = kCacheEntryRoleToHyphenString[i]; + v["count." + role] = ROCKSDB_NAMESPACE::ToString(entry_counts[i]); + v["bytes." + role] = ROCKSDB_NAMESPACE::ToString(total_charges[i]); + v["percent." 
+ role] = + ROCKSDB_NAMESPACE::ToString(100.0 * total_charges[i] / cache_capacity); + } +} + +bool InternalStats::HandleBlockCacheEntryStats(std::string* value, + Slice /*suffix*/) { + Status s = CollectCacheEntryStats(/*foreground*/ true); + if (!s.ok()) { + return false; + } + *value = cache_entry_stats_.ToString(clock_); + return true; +} + +bool InternalStats::HandleBlockCacheEntryStatsMap( + std::map* values, Slice /*suffix*/) { + Status s = CollectCacheEntryStats(/*foreground*/ true); + if (!s.ok()) { + return false; + } + cache_entry_stats_.ToMap(values, clock_); + return true; +} + const DBPropertyInfo* GetPropertyInfo(const Slice& property) { std::string ppt_name = GetPropertyNameAndArg(property).first.ToString(); auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name); @@ -508,11 +686,12 @@ bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info, } bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info, - const Slice& /*property*/, + const Slice& property, std::map* value) { assert(value != nullptr); assert(property_info.handle_map != nullptr); - return (this->*(property_info.handle_map))(value); + Slice arg = GetPropertyNameAndArg(property).second; + return (this->*(property_info.handle_map))(value, arg); } bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info, @@ -588,7 +767,7 @@ bool InternalStats::HandleStats(std::string* value, Slice suffix) { } bool InternalStats::HandleCFMapStats( - std::map* cf_stats) { + std::map* cf_stats, Slice /*suffix*/) { DumpCFMapStats(cf_stats); return true; } @@ -632,7 +811,27 @@ bool InternalStats::HandleAggregatedTableProperties(std::string* value, return true; } -bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, +static std::map MapUint64ValuesToString( + const std::map& from) { + std::map to; + for (const auto& e : from) { + to[e.first] = ToString(e.second); + } + return to; +} + +bool InternalStats::HandleAggregatedTablePropertiesMap( + std::map* values, Slice /*suffix*/) { + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, Slice suffix) { uint64_t level; bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); @@ -645,7 +844,24 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, if (!s.ok()) { return false; } - *value = tp->ToString(); + *values = tp->ToString(); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix) { + uint64_t level; + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties( + &tp, static_cast(level)); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); return true; } @@ -699,21 +915,24 @@ bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/, bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Current size of the active memtable - *value = cfd_->mem()->ApproximateMemoryUsage(); + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = 
cfd_->mem()->ApproximateMemoryUsageFast(); return true; } bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Current size of the active memtable + immutable memtables - *value = cfd_->mem()->ApproximateMemoryUsage() + + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); return true; } bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { - *value = cfd_->mem()->ApproximateMemoryUsage() + + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + cfd_->imm()->ApproximateMemoryUsage(); return true; } @@ -906,7 +1125,7 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, bool InternalStats::HandleBlockCacheStat(Cache** block_cache) { assert(block_cache != nullptr); - auto* table_factory = cfd_->ioptions()->table_factory; + auto* table_factory = cfd_->ioptions()->table_factory.get(); assert(table_factory != nullptr); *block_cache = table_factory->GetOptions(TableFactory::kBlockCacheOpts()); @@ -949,7 +1168,7 @@ bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/, void InternalStats::DumpDBStats(std::string* value) { char buf[1000]; // DB-level stats, only available from default column family - double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + double seconds_up = (clock_->NowMicros() - started_at_ + 1) / kMicrosInSec; double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n", @@ -1033,7 +1252,7 @@ void InternalStats::DumpDBStats(std::string* value) { snprintf( buf, sizeof(buf), "Interval WAL: %s writes, %s syncs, " - "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n", + "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n", NumberToHumanString(interval_write_with_wal).c_str(), NumberToHumanString(interval_wal_synced).c_str(), interval_write_with_wal / static_cast(interval_wal_synced + 1), @@ -1071,9 +1290,10 @@ void InternalStats::DumpDBStats(std::string* value) { */ void InternalStats::DumpCFMapStats( std::map* cf_stats) { + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); CompactionStats compaction_stats_sum; std::map> levels_stats; - DumpCFMapStats(&levels_stats, &compaction_stats_sum); + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); for (auto const& level_ent : levels_stats) { auto level_str = level_ent.first == -1 ? "Sum" : "L" + ToString(level_ent.first); @@ -1090,9 +1310,10 @@ void InternalStats::DumpCFMapStats( } void InternalStats::DumpCFMapStats( + const VersionStorageInfo* vstorage, std::map>* levels_stats, CompactionStats* compaction_stats_sum) { - const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); + assert(vstorage); int num_levels_to_check = (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO) @@ -1133,12 +1354,14 @@ void InternalStats::DumpCFMapStats( if (level == 0) { input_bytes = curr_ingest; } else { - input_bytes = comp_stats_[level].bytes_read_non_output_levels; + input_bytes = comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; } double w_amp = (input_bytes == 0) ? 
0.0 - : static_cast(comp_stats_[level].bytes_written) / + : static_cast(comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob) / input_bytes; std::map level_stats; PrepareLevelStats(&level_stats, files, files_being_compacted[level], @@ -1148,7 +1371,8 @@ void InternalStats::DumpCFMapStats( } } // Cumulative summary - double w_amp = compaction_stats_sum->bytes_written / + double w_amp = (compaction_stats_sum->bytes_written + + compaction_stats_sum->bytes_written_blob) / static_cast(curr_ingest + 1); // Stats summary across levels std::map sum_stats; @@ -1215,9 +1439,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { value->append(buf); // Print stats for each level + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); std::map> levels_stats; CompactionStats compaction_stats_sum; - DumpCFMapStats(&levels_stats, &compaction_stats_sum); + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); for (int l = 0; l < number_levels_; ++l) { if (levels_stats.find(l) != levels_stats.end()) { PrintLevelStats(buf, sizeof(buf), "L" + ToString(l), levels_stats[l]); @@ -1253,7 +1478,8 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { CompactionStats interval_stats(compaction_stats_sum); interval_stats.Subtract(cf_stats_snapshot_.comp_stats); double w_amp = - interval_stats.bytes_written / static_cast(interval_ingest); + (interval_stats.bytes_written + interval_stats.bytes_written_blob) / + static_cast(interval_ingest); PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats); value->append(buf); @@ -1272,7 +1498,13 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { } } - double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + snprintf(buf, sizeof(buf), + "\nBlob file count: %" ROCKSDB_PRIszt ", total size: %.1f GB\n\n", + vstorage->GetBlobFiles().size(), + vstorage->GetTotalBlobFileSize() / kGB); + value->append(buf); + + double seconds_up = (clock_->NowMicros() - started_at_ + 1) / kMicrosInSec; double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", seconds_up, interval_seconds_up); @@ -1312,8 +1544,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { uint64_t compact_micros = 0; for (int level = 0; level < number_levels_; level++) { compact_bytes_read += comp_stats_[level].bytes_read_output_level + - comp_stats_[level].bytes_read_non_output_levels; - compact_bytes_write += comp_stats_[level].bytes_written; + comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; + compact_bytes_write += comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob; compact_micros += comp_stats_[level].micros; } @@ -1384,6 +1618,16 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile; cf_stats_snapshot_.comp_stats = compaction_stats_sum; cf_stats_snapshot_.stall_count = total_stall_count; + + // Always treat CFStats context as "background" + Status s = CollectCacheEntryStats(/*foreground=*/false); + if (s.ok()) { + value->append(cache_entry_stats_.ToString(clock_)); + } else { + value->append("Block cache: "); + value->append(s.ToString()); + value->append("\n"); + } } void InternalStats::DumpCFFileHistogram(std::string* value) { @@ -1406,7 +1650,7 @@ void InternalStats::DumpCFFileHistogram(std::string* value) { << 
blob_file_read_latency_.ToString() << '\n'; } - *value = oss.str(); + value->append(oss.str()); } #else diff --git a/db/internal_stats.h b/db/internal_stats.h index edb2c0582cb..023bf3b5ef9 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -9,16 +9,22 @@ // #pragma once + #include +#include #include #include +#include "cache/cache_entry_roles.h" #include "db/version_set.h" +#include "rocksdb/system_clock.h" class ColumnFamilyData; namespace ROCKSDB_NAMESPACE { +template +class CacheEntryStatsCollector; class DBImpl; class MemTableList; @@ -44,7 +50,9 @@ struct DBPropertyInfo { Version* version); // @param props Map of general properties to populate - bool (InternalStats::*handle_map)(std::map* props); + // @param suffix Argument portion of the property. (see handle_string) + bool (InternalStats::*handle_map)(std::map* props, + Slice suffix); // handle the string type properties rely on DBImpl methods // @param value Value-result argument for storing the property's string value @@ -76,6 +84,8 @@ enum class LevelStatType { AVG_SEC, KEY_IN, KEY_DROP, + R_BLOB_GB, + W_BLOB_GB, TOTAL // total number of types }; @@ -120,18 +130,7 @@ class InternalStats { kIntStatsNumMax, }; - InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) - : db_stats_{}, - cf_stats_value_{}, - cf_stats_count_{}, - comp_stats_(num_levels), - comp_stats_by_pri_(Env::Priority::TOTAL), - file_read_latency_(num_levels), - bg_error_count_(0), - number_levels_(num_levels), - env_(env), - cfd_(cfd), - started_at_(env->NowMicros()) {} + InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd); // Per level compaction stats. comp_stats_[level] stores the stats for // compactions that produced data for the specified "level". @@ -139,32 +138,42 @@ class InternalStats { uint64_t micros; uint64_t cpu_micros; - // The number of bytes read from all non-output levels + // The number of bytes read from all non-output levels (table files) uint64_t bytes_read_non_output_levels; - // The number of bytes read from the compaction output level. + // The number of bytes read from the compaction output level (table files) uint64_t bytes_read_output_level; - // Total number of bytes written during compaction + // The number of bytes read from blob files + uint64_t bytes_read_blob; + + // Total number of bytes written to table files during compaction uint64_t bytes_written; - // Total number of bytes moved to the output level + // Total number of bytes written to blob files during compaction + uint64_t bytes_written_blob; + + // Total number of bytes moved to the output level (table files) uint64_t bytes_moved; - // The number of compaction input files in all non-output levels. + // The number of compaction input files in all non-output levels (table + // files) int num_input_files_in_non_output_levels; - // The number of compaction input files in the output level. + // The number of compaction input files in the output level (table files) int num_input_files_in_output_level; - // The number of compaction output files. 
+ // The number of compaction output files (table files) int num_output_files; + // The number of compaction output files (blob files) + int num_output_files_blob; + // Total incoming entries during compaction between levels N and N+1 uint64_t num_input_records; // Accumulated diff number of entries - // (num input entries - num output entires) for compaction levels N and N+1 + // (num input entries - num output entries) for compaction levels N and N+1 uint64_t num_dropped_records; // Number of compactions done @@ -178,11 +187,14 @@ class InternalStats { cpu_micros(0), bytes_read_non_output_levels(0), bytes_read_output_level(0), + bytes_read_blob(0), bytes_written(0), + bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), num_input_files_in_output_level(0), num_output_files(0), + num_output_files_blob(0), num_input_records(0), num_dropped_records(0), count(0) { @@ -197,11 +209,14 @@ class InternalStats { cpu_micros(0), bytes_read_non_output_levels(0), bytes_read_output_level(0), + bytes_read_blob(0), bytes_written(0), + bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), num_input_files_in_output_level(0), num_output_files(0), + num_output_files_blob(0), num_input_records(0), num_dropped_records(0), count(c) { @@ -222,12 +237,15 @@ class InternalStats { cpu_micros(c.cpu_micros), bytes_read_non_output_levels(c.bytes_read_non_output_levels), bytes_read_output_level(c.bytes_read_output_level), + bytes_read_blob(c.bytes_read_blob), bytes_written(c.bytes_written), + bytes_written_blob(c.bytes_written_blob), bytes_moved(c.bytes_moved), num_input_files_in_non_output_levels( c.num_input_files_in_non_output_levels), num_input_files_in_output_level(c.num_input_files_in_output_level), num_output_files(c.num_output_files), + num_output_files_blob(c.num_output_files_blob), num_input_records(c.num_input_records), num_dropped_records(c.num_dropped_records), count(c.count) { @@ -242,12 +260,15 @@ class InternalStats { cpu_micros = c.cpu_micros; bytes_read_non_output_levels = c.bytes_read_non_output_levels; bytes_read_output_level = c.bytes_read_output_level; + bytes_read_blob = c.bytes_read_blob; bytes_written = c.bytes_written; + bytes_written_blob = c.bytes_written_blob; bytes_moved = c.bytes_moved; num_input_files_in_non_output_levels = c.num_input_files_in_non_output_levels; num_input_files_in_output_level = c.num_input_files_in_output_level; num_output_files = c.num_output_files; + num_output_files_blob = c.num_output_files_blob; num_input_records = c.num_input_records; num_dropped_records = c.num_dropped_records; count = c.count; @@ -264,11 +285,14 @@ class InternalStats { this->cpu_micros = 0; this->bytes_read_non_output_levels = 0; this->bytes_read_output_level = 0; + this->bytes_read_blob = 0; this->bytes_written = 0; + this->bytes_written_blob = 0; this->bytes_moved = 0; this->num_input_files_in_non_output_levels = 0; this->num_input_files_in_output_level = 0; this->num_output_files = 0; + this->num_output_files_blob = 0; this->num_input_records = 0; this->num_dropped_records = 0; this->count = 0; @@ -283,13 +307,16 @@ class InternalStats { this->cpu_micros += c.cpu_micros; this->bytes_read_non_output_levels += c.bytes_read_non_output_levels; this->bytes_read_output_level += c.bytes_read_output_level; + this->bytes_read_blob += c.bytes_read_blob; this->bytes_written += c.bytes_written; + this->bytes_written_blob += c.bytes_written_blob; this->bytes_moved += c.bytes_moved; this->num_input_files_in_non_output_levels += 
c.num_input_files_in_non_output_levels; this->num_input_files_in_output_level += c.num_input_files_in_output_level; this->num_output_files += c.num_output_files; + this->num_output_files_blob += c.num_output_files_blob; this->num_input_records += c.num_input_records; this->num_dropped_records += c.num_dropped_records; this->count += c.count; @@ -304,13 +331,16 @@ class InternalStats { this->cpu_micros -= c.cpu_micros; this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels; this->bytes_read_output_level -= c.bytes_read_output_level; + this->bytes_read_blob -= c.bytes_read_blob; this->bytes_written -= c.bytes_written; + this->bytes_written_blob -= c.bytes_written_blob; this->bytes_moved -= c.bytes_moved; this->num_input_files_in_non_output_levels -= c.num_input_files_in_non_output_levels; this->num_input_files_in_output_level -= c.num_input_files_in_output_level; this->num_output_files -= c.num_output_files; + this->num_output_files_blob -= c.num_output_files_blob; this->num_input_records -= c.num_input_records; this->num_dropped_records -= c.num_dropped_records; this->count -= c.count; @@ -321,6 +351,39 @@ class InternalStats { } }; + // For use with CacheEntryStatsCollector + struct CacheEntryRoleStats { + uint64_t cache_capacity = 0; + std::string cache_id; + std::array total_charges; + std::array entry_counts; + uint32_t collection_count = 0; + uint32_t copies_of_last_collection = 0; + uint64_t last_start_time_micros_ = 0; + uint64_t last_end_time_micros_ = 0; + + void Clear() { + // Wipe everything except collection_count + uint32_t saved_collection_count = collection_count; + *this = CacheEntryRoleStats(); + collection_count = saved_collection_count; + } + + void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros); + std::function + GetEntryCallback(); + void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros); + void SkippedCollection(); + + std::string ToString(SystemClock* clock) const; + void ToMap(std::map* values, + SystemClock* clock) const; + + private: + std::unordered_map role_map_; + uint64_t GetLastDurationMicros() const; + }; + void Clear() { for (int i = 0; i < kIntStatsNumMax; i++) { db_stats_[i].store(0); @@ -329,6 +392,7 @@ class InternalStats { cf_stats_count_[i] = 0; cf_stats_value_[i] = 0; } + cache_entry_stats_.Clear(); for (auto& comp_stat : comp_stats_) { comp_stat.Clear(); } @@ -339,7 +403,7 @@ class InternalStats { cf_stats_snapshot_.Clear(); db_stats_snapshot_.Clear(); bg_error_count_ = 0; - started_at_ = env_->NowMicros(); + started_at_ = clock_->NowMicros(); } void AddCompactionStats(int level, Env::Priority thread_pri, @@ -401,6 +465,15 @@ class InternalStats { return comp_stats_; } + const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats(bool foreground) { + Status s = CollectCacheEntryStats(foreground); + if (!s.ok()) { + assert(false); + cache_entry_stats_.Clear(); + } + return cache_entry_stats_; + } + // Store a mapping from the user-facing DB::Properties string to our // DBPropertyInfo struct used internally for retrieving properties. 
static const std::unordered_map ppt_name_to_info; @@ -409,6 +482,7 @@ class InternalStats { void DumpDBStats(std::string* value); void DumpCFMapStats(std::map* cf_stats); void DumpCFMapStats( + const VersionStorageInfo* vstorage, std::map>* level_stats, CompactionStats* compaction_stats_sum); void DumpCFMapStatsByPriority( @@ -420,11 +494,16 @@ class InternalStats { bool HandleBlockCacheStat(Cache** block_cache); + Status CollectCacheEntryStats(bool foreground); + // Per-DB stats std::atomic db_stats_[kIntStatsNumMax]; // Per-ColumnFamily stats uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX]; uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX]; + CacheEntryRoleStats cache_entry_stats_; + std::shared_ptr> + cache_entry_stats_collector_; // Per-ColumnFamily/level compaction stats std::vector comp_stats_; std::vector comp_stats_by_pri_; @@ -525,7 +604,8 @@ class InternalStats { bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix); bool HandleLevelStats(std::string* value, Slice suffix); bool HandleStats(std::string* value, Slice suffix); - bool HandleCFMapStats(std::map* compaction_stats); + bool HandleCFMapStats(std::map* compaction_stats, + Slice suffix); bool HandleCFStats(std::string* value, Slice suffix); bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix); bool HandleCFFileHistogram(std::string* value, Slice suffix); @@ -533,6 +613,10 @@ class InternalStats { bool HandleSsTables(std::string* value, Slice suffix); bool HandleAggregatedTableProperties(std::string* value, Slice suffix); bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix); + bool HandleAggregatedTablePropertiesMap( + std::map* values, Slice suffix); + bool HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix); bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, Version* version); bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db, @@ -587,6 +671,9 @@ class InternalStats { bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version); bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db, Version* version); + bool HandleBlockCacheEntryStats(std::string* value, Slice suffix); + bool HandleBlockCacheEntryStatsMap(std::map* values, + Slice suffix); // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. 
The failure can // be caused by any possible reason, including file system errors, out of @@ -595,7 +682,7 @@ class InternalStats { uint64_t bg_error_count_; const int number_levels_; - Env* env_; + SystemClock* clock_; ColumnFamilyData* cfd_; uint64_t started_at_; }; @@ -634,18 +721,22 @@ class InternalStats { kIntStatsNumMax, }; - InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {} + InternalStats(int /*num_levels*/, SystemClock* /*clock*/, + ColumnFamilyData* /*cfd*/) {} struct CompactionStats { uint64_t micros; uint64_t cpu_micros; uint64_t bytes_read_non_output_levels; uint64_t bytes_read_output_level; + uint64_t bytes_read_blob; uint64_t bytes_written; + uint64_t bytes_written_blob; uint64_t bytes_moved; int num_input_files_in_non_output_levels; int num_input_files_in_output_level; int num_output_files; + int num_output_files_blob; uint64_t num_input_records; uint64_t num_dropped_records; int count; @@ -673,6 +764,8 @@ class InternalStats { HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; } + HistogramImpl* GetBlobFileReadHist() { return nullptr; } + uint64_t GetBackgroundErrorCount() const { return 0; } uint64_t BumpAndGetBackgroundErrorCount() { return 0; } diff --git a/db/job_context.h b/db/job_context.h index d09937d11f0..cf48888e0db 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -23,7 +23,7 @@ struct SuperVersion; struct SuperVersionContext { struct WriteStallNotification { WriteStallInfo write_stall_info; - const ImmutableCFOptions* immutable_cf_options; + const ImmutableOptions* immutable_options; }; autovector superversions_to_free; @@ -57,15 +57,16 @@ struct SuperVersionContext { #endif } - void PushWriteStallNotification( - WriteStallCondition old_cond, WriteStallCondition new_cond, - const std::string& name, const ImmutableCFOptions* ioptions) { + void PushWriteStallNotification(WriteStallCondition old_cond, + WriteStallCondition new_cond, + const std::string& name, + const ImmutableOptions* ioptions) { #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) WriteStallNotification notif; notif.write_stall_info.cf_name = name; notif.write_stall_info.condition.prev = old_cond; notif.write_stall_info.condition.cur = new_cond; - notif.immutable_cf_options = ioptions; + notif.immutable_options = ioptions; write_stall_notifications.push_back(notif); #else (void)old_cond; @@ -79,7 +80,7 @@ struct SuperVersionContext { #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) // notify listeners on changed write stall conditions for (auto& notif : write_stall_notifications) { - for (auto& listener : notif.immutable_cf_options->listeners) { + for (auto& listener : notif.immutable_options->listeners) { listener->OnStallConditionsChanged(notif.write_stall_info); } } diff --git a/db/kv_checksum.h b/db/kv_checksum.h new file mode 100644 index 00000000000..ba15dca3bb1 --- /dev/null +++ b/db/kv_checksum.h @@ -0,0 +1,424 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file contains classes containing fields to protect individual entries. +// The classes are named "ProtectionInfo", where indicates the +// combination of fields that are covered. Each field has a single letter +// abbreviation as follows. 
+// +// K = key +// V = value +// O = optype aka value type +// T = timestamp +// S = seqno +// C = CF ID +// +// Then, for example, a class that protects an entry consisting of key, value, +// optype, timestamp, and CF ID (i.e., a `WriteBatch` entry) would be named +// `ProtectionInfoKVOTC`. +// +// The `ProtectionInfo.*` classes are templated on the integer type used to hold +// the XOR of hashes for each field. Only unsigned integer types are supported, +// and the maximum supported integer width is 64 bits. When the integer type is +// narrower than the hash values, we lop off the most significant bits to make +// them fit. +// +// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do +// not currently make the byte order consistent for integer fields before +// hashing them, so the resulting values are endianness-dependent. + +#pragma once + +#include + +#include "db/dbformat.h" +#include "rocksdb/types.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +template +class ProtectionInfo; +template +class ProtectionInfoKVOT; +template +class ProtectionInfoKVOTC; +template +class ProtectionInfoKVOTS; + +// Aliases for 64-bit protection infos. +typedef ProtectionInfo ProtectionInfo64; +typedef ProtectionInfoKVOT ProtectionInfoKVOT64; +typedef ProtectionInfoKVOTC ProtectionInfoKVOTC64; +typedef ProtectionInfoKVOTS ProtectionInfoKVOTS64; + +template +class ProtectionInfo { + public: + ProtectionInfo() = default; + + Status GetStatus() const; + ProtectionInfoKVOT ProtectKVOT(const Slice& key, const Slice& value, + ValueType op_type, + const Slice& timestamp) const; + ProtectionInfoKVOT ProtectKVOT(const SliceParts& key, + const SliceParts& value, ValueType op_type, + const Slice& timestamp) const; + + private: + friend class ProtectionInfoKVOT; + friend class ProtectionInfoKVOTS; + friend class ProtectionInfoKVOTC; + + // Each field is hashed with an independent value so we can catch fields being + // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall, + // and we should instead vary our seeds by a large odd number. This value by + // which we increment (0xD28AAD72F49BD50B) was taken from + // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an odd + // number. The values are computed manually since the Windows C++ compiler + // complains about the overflow when adding constants. 
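+  // For example, each seed below is the previous one plus that increment,
+  // reduced modulo 2^64:
+  //   kSeedO = kSeedV + 0xD28AAD72F49BD50B
+  //          = 0xD28AAD72F49BD50B + 0xD28AAD72F49BD50B
+  //          = 0x1A5155AE5E937AA16, truncated to 64 bits
+  //          = 0xA5155AE5E937AA16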
+ static const uint64_t kSeedK = 0; + static const uint64_t kSeedV = 0xD28AAD72F49BD50B; + static const uint64_t kSeedO = 0xA5155AE5E937AA16; + static const uint64_t kSeedT = 0x77A00858DDD37F21; + static const uint64_t kSeedS = 0x4A2AB5CBD26F542C; + static const uint64_t kSeedC = 0x1CB5633EC70B2937; + + ProtectionInfo(T val) : val_(val) { + static_assert(sizeof(ProtectionInfo) == sizeof(T), ""); + } + + T GetVal() const { return val_; } + void SetVal(T val) { val_ = val; } + + T val_ = 0; +}; + +template +class ProtectionInfoKVOT { + public: + ProtectionInfoKVOT() = default; + + ProtectionInfo StripKVOT(const Slice& key, const Slice& value, + ValueType op_type, const Slice& timestamp) const; + ProtectionInfo StripKVOT(const SliceParts& key, const SliceParts& value, + ValueType op_type, const Slice& timestamp) const; + + ProtectionInfoKVOTC ProtectC(ColumnFamilyId column_family_id) const; + ProtectionInfoKVOTS ProtectS(SequenceNumber sequence_number) const; + + void UpdateK(const Slice& old_key, const Slice& new_key); + void UpdateK(const SliceParts& old_key, const SliceParts& new_key); + void UpdateV(const Slice& old_value, const Slice& new_value); + void UpdateV(const SliceParts& old_value, const SliceParts& new_value); + void UpdateO(ValueType old_op_type, ValueType new_op_type); + void UpdateT(const Slice& old_timestamp, const Slice& new_timestamp); + + private: + friend class ProtectionInfo; + friend class ProtectionInfoKVOTS; + friend class ProtectionInfoKVOTC; + + ProtectionInfoKVOT(T val) : info_(val) { + static_assert(sizeof(ProtectionInfoKVOT) == sizeof(T), ""); + } + + T GetVal() const { return info_.GetVal(); } + void SetVal(T val) { info_.SetVal(val); } + + ProtectionInfo info_; +}; + +template +class ProtectionInfoKVOTC { + public: + ProtectionInfoKVOTC() = default; + + ProtectionInfoKVOT StripC(ColumnFamilyId column_family_id) const; + + void UpdateK(const Slice& old_key, const Slice& new_key) { + kvot_.UpdateK(old_key, new_key); + } + void UpdateK(const SliceParts& old_key, const SliceParts& new_key) { + kvot_.UpdateK(old_key, new_key); + } + void UpdateV(const Slice& old_value, const Slice& new_value) { + kvot_.UpdateV(old_value, new_value); + } + void UpdateV(const SliceParts& old_value, const SliceParts& new_value) { + kvot_.UpdateV(old_value, new_value); + } + void UpdateO(ValueType old_op_type, ValueType new_op_type) { + kvot_.UpdateO(old_op_type, new_op_type); + } + void UpdateT(const Slice& old_timestamp, const Slice& new_timestamp) { + kvot_.UpdateT(old_timestamp, new_timestamp); + } + void UpdateC(ColumnFamilyId old_column_family_id, + ColumnFamilyId new_column_family_id); + + private: + friend class ProtectionInfoKVOT; + + ProtectionInfoKVOTC(T val) : kvot_(val) { + static_assert(sizeof(ProtectionInfoKVOTC) == sizeof(T), ""); + } + + T GetVal() const { return kvot_.GetVal(); } + void SetVal(T val) { kvot_.SetVal(val); } + + ProtectionInfoKVOT kvot_; +}; + +template +class ProtectionInfoKVOTS { + public: + ProtectionInfoKVOTS() = default; + + ProtectionInfoKVOT StripS(SequenceNumber sequence_number) const; + + void UpdateK(const Slice& old_key, const Slice& new_key) { + kvot_.UpdateK(old_key, new_key); + } + void UpdateK(const SliceParts& old_key, const SliceParts& new_key) { + kvot_.UpdateK(old_key, new_key); + } + void UpdateV(const Slice& old_value, const Slice& new_value) { + kvot_.UpdateV(old_value, new_value); + } + void UpdateV(const SliceParts& old_value, const SliceParts& new_value) { + kvot_.UpdateV(old_value, new_value); + } + void UpdateO(ValueType 
old_op_type, ValueType new_op_type) { + kvot_.UpdateO(old_op_type, new_op_type); + } + void UpdateT(const Slice& old_timestamp, const Slice& new_timestamp) { + kvot_.UpdateT(old_timestamp, new_timestamp); + } + void UpdateS(SequenceNumber old_sequence_number, + SequenceNumber new_sequence_number); + + private: + friend class ProtectionInfoKVOT; + + ProtectionInfoKVOTS(T val) : kvot_(val) { + static_assert(sizeof(ProtectionInfoKVOTS) == sizeof(T), ""); + } + + T GetVal() const { return kvot_.GetVal(); } + void SetVal(T val) { kvot_.SetVal(val); } + + ProtectionInfoKVOT kvot_; +}; + +template +Status ProtectionInfo::GetStatus() const { + if (val_ != 0) { + return Status::Corruption("ProtectionInfo mismatch"); + } + return Status::OK(); +} + +template +ProtectionInfoKVOT ProtectionInfo::ProtectKVOT( + const Slice& key, const Slice& value, ValueType op_type, + const Slice& timestamp) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + val = val ^ + static_cast(GetSliceNPHash64(timestamp, ProtectionInfo::kSeedT)); + return ProtectionInfoKVOT(val); +} + +template +ProtectionInfoKVOT ProtectionInfo::ProtectKVOT( + const SliceParts& key, const SliceParts& value, ValueType op_type, + const Slice& timestamp) const { + T val = GetVal(); + val = val ^ + static_cast(GetSlicePartsNPHash64(key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSlicePartsNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + val = val ^ + static_cast(GetSliceNPHash64(timestamp, ProtectionInfo::kSeedT)); + return ProtectionInfoKVOT(val); +} + +template +void ProtectionInfoKVOT::UpdateK(const Slice& old_key, + const Slice& new_key) { + T val = GetVal(); + val = val ^ + static_cast(GetSliceNPHash64(old_key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSliceNPHash64(new_key, ProtectionInfo::kSeedK)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateK(const SliceParts& old_key, + const SliceParts& new_key) { + T val = GetVal(); + val = val ^ static_cast( + GetSlicePartsNPHash64(old_key, ProtectionInfo::kSeedK)); + val = val ^ static_cast( + GetSlicePartsNPHash64(new_key, ProtectionInfo::kSeedK)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateV(const Slice& old_value, + const Slice& new_value) { + T val = GetVal(); + val = val ^ + static_cast(GetSliceNPHash64(old_value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(GetSliceNPHash64(new_value, ProtectionInfo::kSeedV)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateV(const SliceParts& old_value, + const SliceParts& new_value) { + T val = GetVal(); + val = val ^ static_cast( + GetSlicePartsNPHash64(old_value, ProtectionInfo::kSeedV)); + val = val ^ static_cast( + GetSlicePartsNPHash64(new_value, ProtectionInfo::kSeedV)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateO(ValueType old_op_type, + ValueType new_op_type) { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&old_op_type), + sizeof(old_op_type), + ProtectionInfo::kSeedO)); + val = val ^ static_cast(NPHash64(reinterpret_cast(&new_op_type), + sizeof(new_op_type), + ProtectionInfo::kSeedO)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateT(const Slice& old_timestamp, + 
const Slice& new_timestamp) { + T val = GetVal(); + val = val ^ static_cast( + GetSliceNPHash64(old_timestamp, ProtectionInfo::kSeedT)); + val = val ^ static_cast( + GetSliceNPHash64(new_timestamp, ProtectionInfo::kSeedT)); + SetVal(val); +} + +template +ProtectionInfo ProtectionInfoKVOT::StripKVOT( + const Slice& key, const Slice& value, ValueType op_type, + const Slice& timestamp) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + val = val ^ + static_cast(GetSliceNPHash64(timestamp, ProtectionInfo::kSeedT)); + return ProtectionInfo(val); +} + +template +ProtectionInfo ProtectionInfoKVOT::StripKVOT( + const SliceParts& key, const SliceParts& value, ValueType op_type, + const Slice& timestamp) const { + T val = GetVal(); + val = val ^ + static_cast(GetSlicePartsNPHash64(key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSlicePartsNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + val = val ^ + static_cast(GetSliceNPHash64(timestamp, ProtectionInfo::kSeedT)); + return ProtectionInfo(val); +} + +template +ProtectionInfoKVOTC ProtectionInfoKVOT::ProtectC( + ColumnFamilyId column_family_id) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&column_family_id), + sizeof(column_family_id), ProtectionInfo::kSeedC)); + return ProtectionInfoKVOTC(val); +} + +template +ProtectionInfoKVOT ProtectionInfoKVOTC::StripC( + ColumnFamilyId column_family_id) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&column_family_id), + sizeof(column_family_id), ProtectionInfo::kSeedC)); + return ProtectionInfoKVOT(val); +} + +template +void ProtectionInfoKVOTC::UpdateC(ColumnFamilyId old_column_family_id, + ColumnFamilyId new_column_family_id) { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&old_column_family_id), + sizeof(old_column_family_id), ProtectionInfo::kSeedC)); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&new_column_family_id), + sizeof(new_column_family_id), ProtectionInfo::kSeedC)); + SetVal(val); +} + +template +ProtectionInfoKVOTS ProtectionInfoKVOT::ProtectS( + SequenceNumber sequence_number) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&sequence_number), + sizeof(sequence_number), + ProtectionInfo::kSeedS)); + return ProtectionInfoKVOTS(val); +} + +template +ProtectionInfoKVOT ProtectionInfoKVOTS::StripS( + SequenceNumber sequence_number) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&sequence_number), + sizeof(sequence_number), + ProtectionInfo::kSeedS)); + return ProtectionInfoKVOT(val); +} + +template +void ProtectionInfoKVOTS::UpdateS(SequenceNumber old_sequence_number, + SequenceNumber new_sequence_number) { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&old_sequence_number), + sizeof(old_sequence_number), ProtectionInfo::kSeedS)); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&new_sequence_number), + sizeof(new_sequence_number), ProtectionInfo::kSeedS)); + SetVal(val); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/listener_test.cc b/db/listener_test.cc index 5f0511d78f6..7c6eb9fe05e 100644 --- a/db/listener_test.cc +++ 
b/db/listener_test.cc @@ -10,7 +10,6 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" @@ -193,10 +192,10 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size()); @@ -212,6 +211,10 @@ class TestFlushListener : public EventListener { : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) { db_closed = false; } + + virtual ~TestFlushListener() { + prev_fc_info_.status.PermitUncheckedError(); // Ignore the status + } void OnTableFileCreated( const TableFileCreationInfo& info) override { // remember the info for later checking the FlushJobInfo. @@ -334,7 +337,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); } @@ -418,7 +421,7 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db)); for (size_t c = 0; c < cf_names.size(); ++c) { ColumnFamilyHandle* handle; - db->CreateColumnFamily(cf_opts, cf_names[c], &handle); + ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle)); handles.push_back(handle); } @@ -436,7 +439,8 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { for (size_t c = 0; c < cf_names.size(); ++c) { for (int d = 0; d < kNumDBs; ++d) { ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c])); - static_cast_with_check(dbs[d])->TEST_WaitForFlushMemTable(); + ASSERT_OK( + static_cast_with_check(dbs[d])->TEST_WaitForFlushMemTable()); } } @@ -495,10 +499,10 @@ TEST_F(EventListenerTest, DisableBGCompaction) { // keep writing until writes are forced to stop. 
for (int i = 0; static_cast(cf_meta.file_count) < kSlowdownTrigger * 10; ++i) { - Put(1, ToString(i), std::string(10000, 'x'), WriteOptions()); + ASSERT_OK(Put(1, ToString(i), std::string(10000, 'x'), WriteOptions())); FlushOptions fo; fo.allow_write_stall = true; - db_->Flush(fo, handles_[1]); + ASSERT_OK(db_->Flush(fo, handles_[1])); db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); } ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9); @@ -535,7 +539,7 @@ TEST_F(EventListenerTest, CompactionReasonLevel) { for (int i = 0; i < 4; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(listener->compaction_reasons_.size(), 1); ASSERT_EQ(listener->compaction_reasons_[0], @@ -552,14 +556,14 @@ TEST_F(EventListenerTest, CompactionReasonLevel) { } // Do a trivial move from L0 -> L1 - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); options.max_bytes_for_level_base = 1; Close(); listener->compaction_reasons_.clear(); Reopen(options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 1); for (auto compaction_reason : listener->compaction_reasons_) { @@ -571,7 +575,7 @@ TEST_F(EventListenerTest, CompactionReasonLevel) { listener->compaction_reasons_.clear(); Reopen(options); - Put("key", "value"); + ASSERT_OK(Put("key", "value")); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -605,7 +609,7 @@ TEST_F(EventListenerTest, CompactionReasonUniversal) { for (int i = 0; i < 8; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -623,7 +627,7 @@ TEST_F(EventListenerTest, CompactionReasonUniversal) { for (int i = 0; i < 8; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -635,7 +639,7 @@ TEST_F(EventListenerTest, CompactionReasonUniversal) { listener->compaction_reasons_.clear(); Reopen(options); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -664,7 +668,7 @@ TEST_F(EventListenerTest, CompactionReasonFIFO) { for (int i = 0; i < 4; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -676,7 +680,7 @@ class TableFileCreationListener : public EventListener { public: class TestEnv : public EnvWrapper { public: - TestEnv() : EnvWrapper(Env::Default()) {} + explicit TestEnv(Env* t) : EnvWrapper(t) {} void SetStatus(Status s) { status_ = s; } @@ -688,7 +692,7 @@ class TableFileCreationListener : public EventListener { return status_; } } - return Env::Default()->NewWritableFile(fname, result, options); + return target()->NewWritableFile(fname, result, options); } private: @@ -766,7 +770,6 @@ class 
TableFileCreationListener : public EventListener { } } - TestEnv test_env; int started_[2]; int finished_[2]; int failure_[2]; @@ -775,44 +778,48 @@ class TableFileCreationListener : public EventListener { TEST_F(EventListenerTest, TableFileCreationListenersTest) { auto listener = std::make_shared(); Options options; + std::unique_ptr test_env( + new TableFileCreationListener::TestEnv(CurrentOptions().env)); options.create_if_missing = true; options.listeners.push_back(listener); - options.env = &listener->test_env; + options.env = test_env.get(); DestroyAndReopen(options); ASSERT_OK(Put("foo", "aaa")); ASSERT_OK(Put("bar", "bbb")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); - ASSERT_OK(Put("foo", "aaa1")); ASSERT_OK(Put("bar", "bbb1")); - listener->test_env.SetStatus(Status::NotSupported("not supported")); + test_env->SetStatus(Status::NotSupported("not supported")); ASSERT_NOK(Flush()); listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); - listener->test_env.SetStatus(Status::OK()); + test_env->SetStatus(Status::OK()); Reopen(options); ASSERT_OK(Put("foo", "aaa2")); ASSERT_OK(Put("bar", "bbb2")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); const Slice kRangeStart = "a"; const Slice kRangeEnd = "z"; - dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0); ASSERT_OK(Put("foo", "aaa3")); ASSERT_OK(Put("bar", "bbb3")); ASSERT_OK(Flush()); - listener->test_env.SetStatus(Status::NotSupported("not supported")); - dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); - dbfull()->TEST_WaitForCompact(); + test_env->SetStatus(Status::NotSupported("not supported")); + ASSERT_NOK( + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1); + Close(); } class MemTableSealedListener : public EventListener { @@ -833,6 +840,7 @@ class MemTableSealedListener : public EventListener { TEST_F(EventListenerTest, MemTableSealedListenerTest) { auto listener = std::make_shared(); Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.listeners.push_back(listener); DestroyAndReopen(options); @@ -1066,7 +1074,7 @@ TEST_F(EventListenerTest, OnFileOperationTest) { TestFileOperationListener* listener = new TestFileOperationListener(); options.listeners.emplace_back(listener); - options.use_direct_io_for_flush_and_compaction = true; + options.use_direct_io_for_flush_and_compaction = false; Status s = TryReopen(options); if (s.IsInvalidArgument()) { options.use_direct_io_for_flush_and_compaction = false; @@ -1075,8 +1083,8 @@ TEST_F(EventListenerTest, OnFileOperationTest) { } DestroyAndReopen(options); ASSERT_OK(Put("foo", "aaa")); - dbfull()->Flush(FlushOptions()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_GE(listener->file_writes_.load(), listener->file_writes_success_.load()); ASSERT_GT(listener->file_writes_.load(), 0); diff --git a/db/log_test.cc b/db/log_test.cc index 
2697619683f..2e993d8f905 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -9,7 +9,6 @@ #include "db/log_reader.h" #include "db/log_writer.h" -#include "env/composite_env_wrapper.h" #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "rocksdb/env.h" @@ -50,7 +49,7 @@ static std::string RandomSkewedString(int i, Random* rnd) { // get<1>(tuple): true if allow retry after read EOF, false otherwise class LogTest : public ::testing::TestWithParam> { private: - class StringSource : public SequentialFile { + class StringSource : public FSSequentialFile { public: Slice& contents_; bool force_error_; @@ -68,7 +67,8 @@ class LogTest : public ::testing::TestWithParam> { returned_partial_(false), fail_after_read_partial_(fail_after_read_partial) {} - Status Read(size_t n, Slice* result, char* scratch) override { + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { if (fail_after_read_partial_) { EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error"; } @@ -81,7 +81,7 @@ class LogTest : public ::testing::TestWithParam> { contents_.remove_prefix(force_error_position_); force_error_ = false; returned_partial_ = true; - return Status::Corruption("read error"); + return IOStatus::Corruption("read error"); } } @@ -106,28 +106,21 @@ class LogTest : public ::testing::TestWithParam> { *result = Slice(scratch, n); contents_.remove_prefix(n); - return Status::OK(); + return IOStatus::OK(); } - Status Skip(uint64_t n) override { + IOStatus Skip(uint64_t n) override { if (n > contents_.size()) { contents_.clear(); - return Status::NotFound("in-memory file skipepd past end"); + return IOStatus::NotFound("in-memory file skipepd past end"); } contents_.remove_prefix(n); - return Status::OK(); + return IOStatus::OK(); } }; - inline StringSource* GetStringSourceFromLegacyReader( - SequentialFileReader* reader) { - LegacySequentialFileWrapper* file = - static_cast(reader->file()); - return static_cast(file->target()); - } - class ReportCollector : public Reader::Reporter { public: size_t dropped_bytes_; @@ -140,29 +133,17 @@ class LogTest : public ::testing::TestWithParam> { } }; - std::string& dest_contents() { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - return dest->contents_; - } + std::string& dest_contents() { return sink_->contents_; } - const std::string& dest_contents() const { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - return dest->contents_; - } + const std::string& dest_contents() const { return sink_->contents_; } - void reset_source_contents() { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - assert(src); - src->contents_ = dest_contents(); - } + void reset_source_contents() { source_->contents_ = dest_contents(); } Slice reader_contents_; - std::unique_ptr dest_holder_; - std::unique_ptr source_holder_; + test::StringSink* sink_; + StringSource* source_; ReportCollector report_; - Writer writer_; + std::unique_ptr writer_; std::unique_ptr reader_; protected: @@ -171,19 +152,23 @@ class LogTest : public ::testing::TestWithParam> { public: LogTest() : reader_contents_(), - dest_holder_(test::GetWritableFileWriter( - new test::StringSink(&reader_contents_), "" /* don't care */)), - source_holder_(test::GetSequentialFileReader( - new StringSource(reader_contents_, !std::get<1>(GetParam())), - "" /* file name */)), - writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())), + sink_(new 
test::StringSink(&reader_contents_)), + source_(new StringSource(reader_contents_, !std::get<1>(GetParam()))), allow_retry_read_(std::get<1>(GetParam())) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink_holder), "" /* don't care */, FileOptions())); + writer_.reset( + new Writer(std::move(file_writer), 123, std::get<0>(GetParam()))); + std::unique_ptr source_holder(source_); + std::unique_ptr file_reader( + new SequentialFileReader(std::move(source_holder), "" /* file name */)); if (allow_retry_read_) { - reader_.reset(new FragmentBufferedReader( - nullptr, std::move(source_holder_), &report_, true /* checksum */, - 123 /* log_number */)); + reader_.reset(new FragmentBufferedReader(nullptr, std::move(file_reader), + &report_, true /* checksum */, + 123 /* log_number */)); } else { - reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_, + reader_.reset(new Reader(nullptr, std::move(file_reader), &report_, true /* checksum */, 123 /* log_number */)); } } @@ -191,7 +176,7 @@ class LogTest : public ::testing::TestWithParam> { Slice* get_reader_contents() { return &reader_contents_; } void Write(const std::string& msg) { - writer_.AddRecord(Slice(msg)); + ASSERT_OK(writer_->AddRecord(Slice(msg))); } size_t WrittenBytes() const { @@ -219,11 +204,7 @@ class LogTest : public ::testing::TestWithParam> { dest_contents()[offset] = new_byte; } - void ShrinkSize(int bytes) { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - dest->Drop(bytes); - } + void ShrinkSize(int bytes) { sink_->Drop(bytes); } void FixChecksum(int header_offset, int len, bool recyclable) { // Compute crc of type/len/data @@ -235,9 +216,8 @@ class LogTest : public ::testing::TestWithParam> { } void ForceError(size_t position = 0) { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->force_error_ = true; - src->force_error_position_ = position; + source_->force_error_ = true; + source_->force_error_position_ = position; } size_t DroppedBytes() const { @@ -249,14 +229,12 @@ class LogTest : public ::testing::TestWithParam> { } void ForceEOF(size_t position = 0) { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->force_eof_ = true; - src->force_eof_position_ = position; + source_->force_eof_ = true; + source_->force_eof_position_ = position; } void UnmarkEOF() { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->returned_partial_ = false; + source_->returned_partial_ = false; reader_->UnmarkEOF(); } @@ -685,12 +663,13 @@ TEST_P(LogTest, Recycle) { while (get_reader_contents()->size() < log::kBlockSize * 2) { Write("xxxxxxxxxxxxxxxx"); } - std::unique_ptr dest_holder(test::GetWritableFileWriter( - new test::OverwritingStringSink(get_reader_contents()), - "" /* don't care */)); + std::unique_ptr sink( + new test::OverwritingStringSink(get_reader_contents())); + std::unique_ptr dest_holder(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); Writer recycle_writer(std::move(dest_holder), 123, true); - recycle_writer.AddRecord(Slice("foooo")); - recycle_writer.AddRecord(Slice("bar")); + ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); ASSERT_EQ("foooo", Read()); ASSERT_EQ("bar", Read()); @@ -718,10 +697,9 @@ class RetriableLogTest : public ::testing::TestWithParam { }; Slice contents_; - std::unique_ptr dest_holder_; + 
test::StringSink* sink_; std::unique_ptr log_writer_; Env* env_; - EnvOptions env_options_; const std::string test_dir_; const std::string log_file_; std::unique_ptr writer_; @@ -732,61 +710,58 @@ class RetriableLogTest : public ::testing::TestWithParam { public: RetriableLogTest() : contents_(), - dest_holder_(nullptr), + sink_(new test::StringSink(&contents_)), log_writer_(nullptr), env_(Env::Default()), test_dir_(test::PerThreadDBPath("retriable_log_test")), log_file_(test_dir_ + "/log"), writer_(nullptr), reader_(nullptr), - log_reader_(nullptr) {} + log_reader_(nullptr) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr wfw(new WritableFileWriter( + std::move(sink_holder), "" /* file name */, FileOptions())); + log_writer_.reset(new Writer(std::move(wfw), 123, GetParam())); + } Status SetupTestEnv() { - dest_holder_.reset(test::GetWritableFileWriter( - new test::StringSink(&contents_), "" /* file name */)); - assert(dest_holder_ != nullptr); - log_writer_.reset(new Writer(std::move(dest_holder_), 123, GetParam())); - assert(log_writer_ != nullptr); - Status s; - s = env_->CreateDirIfMissing(test_dir_); - std::unique_ptr writable_file; + FileOptions fopts; + auto fs = env_->GetFileSystem(); + s = fs->CreateDirIfMissing(test_dir_, IOOptions(), nullptr); + std::unique_ptr writable_file; if (s.ok()) { - s = env_->NewWritableFile(log_file_, &writable_file, env_options_); + s = fs->NewWritableFile(log_file_, fopts, &writable_file, nullptr); } if (s.ok()) { - writer_.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), log_file_, - env_options_)); - assert(writer_ != nullptr); + writer_.reset( + new WritableFileWriter(std::move(writable_file), log_file_, fopts)); + EXPECT_NE(writer_, nullptr); } - std::unique_ptr seq_file; + std::unique_ptr seq_file; if (s.ok()) { - s = env_->NewSequentialFile(log_file_, &seq_file, env_options_); + s = fs->NewSequentialFile(log_file_, fopts, &seq_file, nullptr); } if (s.ok()) { - reader_.reset(new SequentialFileReader( - NewLegacySequentialFileWrapper(seq_file), log_file_)); - assert(reader_ != nullptr); + reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_)); + EXPECT_NE(reader_, nullptr); log_reader_.reset(new FragmentBufferedReader( nullptr, std::move(reader_), &report_, true /* checksum */, 123 /* log_number */)); - assert(log_reader_ != nullptr); + EXPECT_NE(log_reader_, nullptr); } return s; } - std::string contents() { - auto file = test::GetStringSinkFromLegacyWriter(log_writer_->file()); - assert(file != nullptr); - return file->contents_; - } + std::string contents() { return sink_->contents_; } - void Encode(const std::string& msg) { log_writer_->AddRecord(Slice(msg)); } + void Encode(const std::string& msg) { + ASSERT_OK(log_writer_->AddRecord(Slice(msg))); + } void Write(const Slice& data) { - writer_->Append(data); - writer_->Sync(true); + ASSERT_OK(writer_->Append(data)); + ASSERT_OK(writer_->Sync(true)); } bool TryRead(std::string* result) { diff --git a/db/log_writer.cc b/db/log_writer.cc index e290eae6258..6a82f31e10e 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -33,7 +33,7 @@ Writer::Writer(std::unique_ptr&& dest, uint64_t log_number, Writer::~Writer() { if (dest_) { - WriteBuffer(); + WriteBuffer().PermitUncheckedError(); } } diff --git a/db/log_writer.h b/db/log_writer.h index 463826e88ee..1a91b21994d 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -8,8 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
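// An illustrative note on the log_writer.cc destructor change above (a
// sketch, not part of this patch; `writer` is a hypothetical log::Writer):
// under RocksDB's assert-status-checked builds, destroying a Status that was
// never examined trips an assertion, so a result that is deliberately
// ignored has to be released explicitly.
Status s = writer.WriteBuffer();
s.PermitUncheckedError();  // acknowledge the status is intentionally dropped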
#pragma once -#include - +#include #include #include "db/log_format.h" diff --git a/db/logs_with_prep_tracker.h b/db/logs_with_prep_tracker.h index 86c88012adc..7f9ece76bca 100644 --- a/db/logs_with_prep_tracker.h +++ b/db/logs_with_prep_tracker.h @@ -5,8 +5,8 @@ // #pragma once -#include #include +#include #include #include #include diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index c8039b5397a..9005e932a65 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -100,13 +100,13 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { options.compaction_filter = new DestroyAllCompactionFilter(); ASSERT_OK(DB::Open(options, dbname_, &db)); - db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); - db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); - db->Put(WriteOptions(), Slice("key3"), Slice("value3")); - db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("destroy"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key2"), Slice("destroy"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key4"), Slice("destroy"))); Slice key4("key4"); - db->CompactRange(CompactRangeOptions(), nullptr, &key4); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &key4)); Iterator* itr = db->NewIterator(ReadOptions()); itr->SeekToFirst(); ASSERT_TRUE(itr->Valid()); @@ -135,21 +135,21 @@ TEST_F(ManualCompactionTest, Test) { // create first key range WriteBatch batch; for (int i = 0; i < kNumKeys; i++) { - batch.Put(Key1(i), "value for range 1 key"); + ASSERT_OK(batch.Put(Key1(i), "value for range 1 key")); } ASSERT_OK(db->Write(WriteOptions(), &batch)); // create second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Put(Key2(i), "value for range 2 key"); + ASSERT_OK(batch.Put(Key2(i), "value for range 2 key")); } ASSERT_OK(db->Write(WriteOptions(), &batch)); // delete second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Delete(Key2(i)); + ASSERT_OK(batch.Delete(Key2(i))); } ASSERT_OK(db->Write(WriteOptions(), &batch)); @@ -160,7 +160,7 @@ TEST_F(ManualCompactionTest, Test) { Slice greatest(end_key.data(), end_key.size()); // commenting out the line below causes the example to work correctly - db->CompactRange(CompactRangeOptions(), &least, &greatest); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &least, &greatest)); // count the keys Iterator* iter = db->NewIterator(ReadOptions()); @@ -205,7 +205,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { Slice start("5"); Slice end("7"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); ASSERT_EQ(0, filter->NumKeys()); } @@ -215,7 +215,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { Slice start("3"); Slice end("7"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); ASSERT_EQ(2, filter->NumKeys()); ASSERT_EQ(0, filter->KeyLevel("4")); ASSERT_EQ(0, filter->KeyLevel("8")); @@ -227,7 +227,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { // no file has keys in range (-inf, 0] Slice end("0"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), nullptr, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &end)); ASSERT_EQ(0, filter->NumKeys()); } @@ -237,7 +237,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { // no file has keys in 
range [9, inf) Slice start("9"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); ASSERT_EQ(0, filter->NumKeys()); } @@ -248,7 +248,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { Slice start("2"); Slice end("2"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); ASSERT_EQ(1, filter->NumKeys()); ASSERT_EQ(0, filter->KeyLevel("2")); } @@ -260,7 +260,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { Slice start("2"); Slice end("5"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); ASSERT_EQ(3, filter->NumKeys()); ASSERT_EQ(1, filter->KeyLevel("2")); ASSERT_EQ(1, filter->KeyLevel("4")); @@ -273,7 +273,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { // [0, inf) overlaps all files Slice start("0"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); ASSERT_EQ(4, filter->NumKeys()); // 1 is first compacted to L1 and then further compacted into [2, 4, 8], // so finally the logged level for 1 is L1. diff --git a/db/memtable.cc b/db/memtable.cc index 53be973e795..2b2598658b1 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -13,7 +13,9 @@ #include #include #include + #include "db/dbformat.h" +#include "db/kv_checksum.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" @@ -41,7 +43,7 @@ namespace ROCKSDB_NAMESPACE { ImmutableMemTableOptions::ImmutableMemTableOptions( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options) : arena_block_size(mutable_cf_options.arena_block_size), memtable_prefix_bloom_bits( @@ -56,13 +58,13 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), inplace_callback(ioptions.inplace_callback), max_successive_merges(mutable_cf_options.max_successive_merges), - statistics(ioptions.statistics), - merge_operator(ioptions.merge_operator), - info_log(ioptions.info_log), + statistics(ioptions.stats), + merge_operator(ioptions.merge_operator.get()), + info_log(ioptions.logger), allow_data_in_errors(ioptions.allow_data_in_errors) {} MemTable::MemTable(const InternalKeyComparator& cmp, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber latest_seq, uint32_t column_family_id) @@ -80,9 +82,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp, mutable_cf_options.memtable_huge_page_size), table_(ioptions.memtable_factory->CreateMemTableRep( comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), - ioptions.info_log, column_family_id)), + ioptions.logger, column_family_id)), range_del_table_(SkipListFactory().CreateMemTableRep( - comparator_, &arena_, nullptr /* transform */, ioptions.info_log, + comparator_, &arena_, nullptr /* transform */, ioptions.logger, column_family_id)), is_range_del_table_empty_(true), data_size_(0), @@ -102,9 +104,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp, : 0), prefix_extractor_(mutable_cf_options.prefix_extractor.get()), flush_state_(FLUSH_NOT_REQUESTED), - env_(ioptions.env), + clock_(ioptions.clock), insert_with_hint_prefix_extractor_( - 
ioptions.memtable_insert_with_hint_prefix_extractor), + ioptions.memtable_insert_with_hint_prefix_extractor.get()), oldest_key_time_(std::numeric_limits::max()), atomic_flush_seqno_(kMaxSequenceNumber), approximate_memory_usage_(0) { @@ -118,7 +120,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, bloom_filter_.reset( new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, 6 /* hard coded 6 probes */, - moptions_.memtable_huge_page_size, ioptions.info_log)); + moptions_.memtable_huge_page_size, ioptions.logger)); } } @@ -221,7 +223,7 @@ void MemTable::UpdateOldestKeyTime() { uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed); if (oldest_key_time == std::numeric_limits::max()) { int64_t current_time = 0; - auto s = env_->GetCurrentTime(¤t_time); + auto s = clock_->GetCurrentTime(¤t_time); if (s.ok()) { assert(current_time >= 0); // If fail, the timestamp is already set. @@ -328,9 +330,11 @@ class MemTableIterator : public InternalIterator { PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { // iterator should only use prefix bloom filter - Slice user_k(ExtractUserKey(k)); - if (prefix_extractor_->InDomain(user_k) && - !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts) && + !bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -345,9 +349,11 @@ class MemTableIterator : public InternalIterator { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { - Slice user_k(ExtractUserKey(k)); - if (prefix_extractor_->InDomain(user_k) && - !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts) && + !bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -480,10 +486,55 @@ MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, return {entry_count * (data_size / n), entry_count}; } -bool MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, /* user key */ - const Slice& value, bool allow_concurrent, - MemTablePostProcessInfo* post_process_info, void** hint) { +Status MemTable::VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOTS64& kv_prot_info) { + uint32_t ikey_len = 0; + if (!GetVarint32(&encoded, &ikey_len)) { + return Status::Corruption("Unable to parse internal key length"); + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (ikey_len < 8 + ts_sz) { + return Status::Corruption("Internal key length too short"); + } + if (ikey_len > encoded.size()) { + return Status::Corruption("Internal key length too long"); + } + uint32_t value_len = 0; + const size_t key_without_ts_len = ikey_len - ts_sz - 8; + Slice key(encoded.data(), key_without_ts_len); + encoded.remove_prefix(key_without_ts_len); + + Slice timestamp(encoded.data(), ts_sz); + encoded.remove_prefix(ts_sz); + + uint64_t packed = DecodeFixed64(encoded.data()); + ValueType value_type = kMaxValue; + SequenceNumber sequence_number = kMaxSequenceNumber; + UnPackSequenceAndType(packed, 
&sequence_number, &value_type); + encoded.remove_prefix(8); + + if (!GetVarint32(&encoded, &value_len)) { + return Status::Corruption("Unable to parse value length"); + } + if (value_len < encoded.size()) { + return Status::Corruption("Value length too short"); + } + if (value_len > encoded.size()) { + return Status::Corruption("Value length too long"); + } + Slice value(encoded.data(), value_len); + + return kv_prot_info.StripS(sequence_number) + .StripKVOT(key, value, value_type, timestamp) + .GetStatus(); +} + +Status MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, /* user key */ + const Slice& value, + const ProtectionInfoKVOTS64* kv_prot_info, + bool allow_concurrent, + MemTablePostProcessInfo* post_process_info, void** hint) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] @@ -510,7 +561,17 @@ bool MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + if (kv_prot_info != nullptr) { + Slice encoded(buf, encoded_len); + TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded); + Status status = VerifyEncodedEntry(encoded, *kv_prot_info); + if (!status.ok()) { + return status; + } + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz); if (!allow_concurrent) { // Extract prefix for insert with hint. @@ -519,12 +580,12 @@ bool MemTable::Add(SequenceNumber s, ValueType type, Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } } else { bool res = table->InsertKey(handle); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } } @@ -540,11 +601,11 @@ bool MemTable::Add(SequenceNumber s, ValueType type, } if (bloom_filter_ && prefix_extractor_ && - prefix_extractor_->InDomain(key)) { - bloom_filter_->Add(prefix_extractor_->Transform(key)); + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); + bloom_filter_->Add(key_without_ts); } // The first sequence number inserted into the memtable @@ -565,7 +626,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, ? table->InsertKeyConcurrently(handle) : table->InsertKeyWithHintConcurrently(handle, hint); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } assert(post_process_info != nullptr); @@ -576,11 +637,12 @@ bool MemTable::Add(SequenceNumber s, ValueType type, } if (bloom_filter_ && prefix_extractor_ && - prefix_extractor_->InDomain(key)) { - bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->AddConcurrently( + prefix_extractor_->Transform(key_without_ts)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); + bloom_filter_->AddConcurrently(key_without_ts); } // atomically update first_seqno_ and earliest_seqno_. 
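// A caller-side sketch of the Status-returning MemTable::Add() introduced
// above (an example under assumptions, not part of this patch; `mem`, `seq`,
// `key` and `value` are placeholders). Where callers previously branched on
// a bool, they now inspect the Status: TryAgain means the (key, seq) pair
// already exists and the memtable rep cannot handle duplicates, so the write
// should be retried with a larger sequence number; any other non-OK status
// (e.g. a mismatch reported by VerifyEncodedEntry() when kv_prot_info is
// supplied) is a real error.
Status add_status =
    mem->Add(seq, kTypeValue, key, value, nullptr /* kv_prot_info */);
if (add_status.IsTryAgain()) {
  // Retry the insertion with a larger `seq`.
} else if (!add_status.ok()) {
  // Propagate the failure (e.g. integrity check mismatch).
}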
@@ -599,7 +661,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, is_range_del_table_empty_.store(false, std::memory_order_relaxed); } UpdateOldestKeyTime(); - return true; + return Status::OK(); } // Callback from MemTable::Get() @@ -622,7 +684,8 @@ struct Saver { Statistics* statistics; bool inplace_update_support; bool do_merge; - Env* env_; + SystemClock* clock; + ReadCallback* callback_; bool* is_blob_index; bool allow_data_in_errors; @@ -660,8 +723,8 @@ static bool SaveValue(void* arg, const char* entry) { const Comparator* user_comparator = s->mem->GetInternalKeyComparator().user_comparator(); size_t ts_sz = user_comparator->timestamp_size(); - if (user_comparator->CompareWithoutTimestamp(user_key_slice, - s->key->user_key()) == 0) { + if (user_comparator->EqualWithoutTimestamp(user_key_slice, + s->key->user_key())) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -706,7 +769,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), &v, merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + s->statistics, s->clock, nullptr /* result_operand */, true); } } else { // Preserve the value with the goal of returning it as part of @@ -745,7 +808,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + s->statistics, s->clock, nullptr /* result_operand */, true); } } else { *(s->status) = Status::NotFound(); @@ -773,7 +836,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, s->statistics, - s->env_, nullptr /* result_operand */, true); + s->clock, nullptr /* result_operand */, true); *(s->found_final_value) = true; return false; } @@ -820,22 +883,21 @@ bool MemTable::Get(const LookupKey& key, std::string* value, range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key())); } - Slice user_key = key.user_key(); bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = - bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); + may_contain = bloom_filter_->MayContain(user_key_without_ts); } else { assert(prefix_extractor_); - may_contain = - !prefix_extractor_->InDomain(user_key) || - bloom_filter_->MayContain(prefix_extractor_->Transform(user_key)); + may_contain = !prefix_extractor_->InDomain(user_key_without_ts) || + bloom_filter_->MayContain( + prefix_extractor_->Transform(user_key_without_ts)); } } @@ -882,7 +944,7 @@ void MemTable::GetFromTable(const LookupKey& key, saver.logger = moptions_.info_log; saver.inplace_update_support = moptions_.inplace_update_support; saver.statistics = moptions_.statistics; - saver.env_ = env_; + saver.clock = clock_; saver.callback_ = callback; saver.is_blob_index = is_blob_index; 
saver.do_merge = do_merge; @@ -892,7 +954,7 @@ void MemTable::GetFromTable(const LookupKey& key, } void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob) { + ReadCallback* callback) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. @@ -908,16 +970,18 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, int num_keys = 0; for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { if (!prefix_extractor_) { - keys[num_keys++] = &iter->ukey; - } else if (prefix_extractor_->InDomain(iter->ukey)) { - prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey)); + keys[num_keys++] = &iter->ukey_without_ts; + } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) { + prefixes.emplace_back( + prefix_extractor_->Transform(iter->ukey_without_ts)); keys[num_keys++] = &prefixes.back(); } } bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]); int idx = 0; for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) { + if (prefix_extractor_ && + !prefix_extractor_->InDomain(iter->ukey_without_ts)) { PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); continue; } @@ -943,9 +1007,9 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key())); } GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, - callback, is_blob, iter->value->GetSelf(), iter->timestamp, - iter->s, &(iter->merge_context), &seq, &found_final_value, - &merge_in_progress); + callback, &iter->is_blob_index, iter->value->GetSelf(), + iter->timestamp, iter->s, &(iter->merge_context), &seq, + &found_final_value, &merge_in_progress); if (!found_final_value && merge_in_progress) { *(iter->s) = Status::MergeInProgress(); @@ -970,9 +1034,9 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, PERF_COUNTER_ADD(get_from_memtable_count, 1); } -void MemTable::Update(SequenceNumber seq, - const Slice& key, - const Slice& value) { +Status MemTable::Update(SequenceNumber seq, const Slice& key, + const Slice& value, + const ProtectionInfoKVOTS64* kv_prot_info) { LookupKey lkey(key, seq); Slice mem_key = lkey.memtable_key(); @@ -1016,22 +1080,26 @@ void MemTable::Update(SequenceNumber seq, (unsigned)(VarintLength(key_length) + key_length + VarintLength(value.size()) + value.size())); RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); - return; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. + updated_kv_prot_info.UpdateS(seq, existing_seq); + Slice encoded(entry, p + value.size() - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } + return Status::OK(); } } } } - // key doesn't exist - bool add_res __attribute__((__unused__)); - add_res = Add(seq, kTypeValue, key, value); - // We already checked unused != seq above. In that case, Add should not fail. 
- assert(add_res); + // The latest value is not `kTypeValue` or key doesn't exist + return Add(seq, kTypeValue, key, value, kv_prot_info); } -bool MemTable::UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta) { +Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOTS64* kv_prot_info) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -1057,8 +1125,8 @@ bool MemTable::UpdateCallback(SequenceNumber seq, // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; - uint64_t unused; - UnPackSequenceAndType(tag, &unused, &type); + uint64_t existing_seq; + UnPackSequenceAndType(tag, &existing_seq, &type); switch (type) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); @@ -1085,16 +1153,35 @@ bool MemTable::UpdateCallback(SequenceNumber seq, } RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); UpdateFlushState(); - return true; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. + updated_kv_prot_info.UpdateS(seq, existing_seq); + updated_kv_prot_info.UpdateV(delta, + Slice(prev_buffer, new_prev_size)); + Slice encoded(entry, prev_buffer + new_prev_size - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } + return Status::OK(); } else if (status == UpdateStatus::UPDATED) { - Add(seq, kTypeValue, key, Slice(str_value)); + Status s; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(delta, str_value); + s = Add(seq, kTypeValue, key, Slice(str_value), + &updated_kv_prot_info); + } else { + s = Add(seq, kTypeValue, key, Slice(str_value), + nullptr /* kv_prot_info */); + } RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); UpdateFlushState(); - return true; + return s; } else if (status == UpdateStatus::UPDATE_FAILED) { - // No action required. Return. + // `UPDATE_FAILED` is named incorrectly. It indicates no update + // happened. It does not indicate a failure happened. 
UpdateFlushState(); - return true; + return Status::OK(); } } default: @@ -1102,9 +1189,8 @@ bool MemTable::UpdateCallback(SequenceNumber seq, } } } - // If the latest value is not kTypeValue - // or key doesn't exist - return false; + // The latest value is not `kTypeValue` or key doesn't exist + return Status::NotFound(); } size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { diff --git a/db/memtable.h b/db/memtable.h index d5bd4e95aee..54155f9b575 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -15,7 +15,9 @@ #include #include #include + #include "db/dbformat.h" +#include "db/kv_checksum.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" #include "db/version_edit.h" @@ -24,7 +26,6 @@ #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "table/multiget_context.h" #include "util/dynamic_bloom.h" @@ -36,9 +37,10 @@ struct FlushJobInfo; class Mutex; class MemTableIterator; class MergeContext; +class SystemClock; struct ImmutableMemTableOptions { - explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, + explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options); size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; @@ -70,7 +72,7 @@ using MultiGetRange = MultiGetContext::Range; // Note: Many of the methods in this class have comments indicating that // external synchronization is required as these methods are not thread-safe. // It is up to higher layers of code to decide how to prevent concurrent -// invokation of these methods. This is usually done by acquiring either +// invocation of these methods. This is usually done by acquiring either // the db mutex or the single writer thread. // // Some of these methods are documented to only require external @@ -101,7 +103,7 @@ class MemTable { // used, but this may prevent some transactions from succeeding until the // first key is inserted into the memtable. explicit MemTable(const InternalKeyComparator& comparator, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber earliest_seq, uint32_t column_family_id); @@ -137,7 +139,7 @@ class MemTable { // operations on the same MemTable (unless this Memtable is immutable). size_t ApproximateMemoryUsage(); - // As a cheap version of `ApproximateMemoryUsage()`, this function doens't + // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't // require external synchronization. The value may be less accurate though size_t ApproximateMemoryUsageFast() const { return approximate_memory_usage_.load(std::memory_order_relaxed); @@ -175,6 +177,9 @@ class MemTable { FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options, SequenceNumber read_seq); + Status VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOTS64& kv_prot_info); + // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. // Typically value will be empty if type==kTypeDeletion. @@ -182,12 +187,14 @@ class MemTable { // REQUIRES: if allow_concurrent = false, external synchronization to prevent // simultaneous operations on the same MemTable. // - // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and - // the already exists. 
- bool Add(SequenceNumber seq, ValueType type, const Slice& key, - const Slice& value, bool allow_concurrent = false, - MemTablePostProcessInfo* post_process_info = nullptr, - void** hint = nullptr); + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. + Status Add(SequenceNumber seq, ValueType type, const Slice& key, + const Slice& value, const ProtectionInfoKVOTS64* kv_prot_info, + bool allow_concurrent = false, + MemTablePostProcessInfo* post_process_info = nullptr, + void** hint = nullptr); // Used to Get value associated with key or Get Merge Operands associated // with key. @@ -237,37 +244,38 @@ class MemTable { } void MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob); - - // Attempts to update the new_value inplace, else does normal Add - // Pseudocode - // if key exists in current memtable && prev_value is of type kTypeValue - // if new sizeof(new_value) <= sizeof(prev_value) - // update inplace - // else add(key, new_value) - // else add(key, new_value) + ReadCallback* callback); + + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. Otherwise + // adds the new value to the memtable out-of-place. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. - void Update(SequenceNumber seq, - const Slice& key, - const Slice& value); - - // If prev_value for key exists, attempts to update it inplace. - // else returns false - // Pseudocode - // if key exists in current memtable && prev_value is of type kTypeValue - // new_value = delta(prev_value) - // if sizeof(new_value) <= sizeof(prev_value) - // update inplace - // else add(key, new_value) - // else return false + Status Update(SequenceNumber seq, const Slice& key, const Slice& value, + const ProtectionInfoKVOTS64* kv_prot_info); + + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. Otherwise + // if `key` exists in current memtable with type `kTypeValue`, adds the new + // value to the memtable out-of-place. + // + // Returns `Status::NotFound` if `key` does not exist in current memtable or + // the latest version of `key` does not have `kTypeValue`. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. 
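// A hypothetical sketch of how the contracts documented above compose (not
// part of this patch; `mem`, `seq`, `key`, `full_value` and `delta` are
// placeholders): UpdateCallback() now reports "nothing to update" via
// Status::NotFound() instead of returning false.
Status s = mem->UpdateCallback(seq, key, delta, nullptr /* kv_prot_info */);
if (s.IsNotFound()) {
  // No live kTypeValue entry for `key`; insert the full value instead.
  s = mem->Add(seq, kTypeValue, key, full_value, nullptr /* kv_prot_info */);
}
if (s.IsTryAgain()) {
  // (key, seq) already exists in the memtable; retry with a larger `seq`.
}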
- bool UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta); + Status UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOTS64* kv_prot_info); // Returns the number of successive merge entries starting from the newest // entry for the key up to the last non-merge entry or last entry for the @@ -504,7 +512,7 @@ class MemTable { std::atomic flush_state_; - Env* env_; + SystemClock* clock_; // Extract sequential insert prefixes. const SliceTransform* insert_with_hint_prefix_extractor_; @@ -525,7 +533,7 @@ class MemTable { SequenceNumber atomic_flush_seqno_; // keep track of memory usage in table_, arena_, and range_del_table_. - // Gets refrshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` + // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` std::atomic approximate_memory_usage_; #ifndef ROCKSDB_LITE diff --git a/db/memtable_list.cc b/db/memtable_list.cc index dced9f7db7a..97d076b0377 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -113,10 +113,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, } void MemTableListVersion::MultiGet(const ReadOptions& read_options, - MultiGetRange* range, ReadCallback* callback, - bool* is_blob) { + MultiGetRange* range, + ReadCallback* callback) { for (auto memtable : memlist_) { - memtable->MultiGet(read_options, range, callback, is_blob); + memtable->MultiGet(read_options, range, callback); if (range->empty()) { return; } @@ -334,7 +334,7 @@ bool MemTableList::IsFlushPending() const { } // Returns the memtables that need to be flushed. -void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, +void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, autovector* ret) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); @@ -345,7 +345,7 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) { atomic_flush = true; } - if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) { + if (m->GetID() > max_memtable_id) { break; } if (!m->flush_in_progress_) { @@ -473,91 +473,42 @@ Status MemTableList::TryInstallMemtableFlushResults( // TODO(myabandeh): Not sure how batch_count could be 0 here. if (batch_count > 0) { + uint64_t min_wal_number_to_keep = 0; if (vset->db_options()->allow_2pc) { assert(edit_list.size() > 0); + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, *cfd, edit_list, memtables_to_flush, prep_tracker); // We piggyback the information of earliest log file to keep in the // manifest entry for the last file flushed. - edit_list.back()->SetMinLogNumberToKeep(PrecomputeMinLogNumberToKeep( - vset, *cfd, edit_list, memtables_to_flush, prep_tracker)); + edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep); } - // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, - db_directory); - *io_s = vset->io_status(); - - // we will be changing the version in the next code path, - // so we better create a new one, since versions are immutable - InstallNewVersion(); - - // All the later memtables that have the same filenum - // are part of the same batch. They can be committed now. - uint64_t mem_id = 1; // how many memtables have been flushed. - - // commit new state only if the column family is NOT dropped. 
- // The reason is as follows (refer to - // ColumnFamilyTest.FlushAndDropRaceCondition). - // If the column family is dropped, then according to LogAndApply, its - // corresponding flush operation is NOT written to the MANIFEST. This - // means the DB is not aware of the L0 files generated from the flush. - // By committing the new state, we remove the memtable from the memtable - // list. Creating an iterator on this column family will not be able to - // read full data since the memtable is removed, and the DB is not aware - // of the L0 files, causing MergingIterator unable to build child - // iterators. RocksDB contract requires that the iterator can be created - // on a dropped column family, and we must be able to - // read full data as long as column family handle is not deleted, even if - // the column family is dropped. - if (s.ok() && !cfd->IsDropped()) { // commit new state - while (batch_count-- > 0) { - MemTable* m = current_->memlist_.back(); - if (m->edit_.GetBlobFileAdditions().empty()) { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " done", - cfd->GetName().c_str(), m->file_number_, mem_id); - } else { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - " (+%zu blob files)" - ": memtable #%" PRIu64 " done", - cfd->GetName().c_str(), m->file_number_, - m->edit_.GetBlobFileAdditions().size(), mem_id); - } - - assert(m->file_number_ > 0); - current_->Remove(m, to_delete); - UpdateCachedValuesFromMemTableListVersion(); - ResetTrimHistoryNeeded(); - ++mem_id; + std::unique_ptr wal_deletion; + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (!vset->db_options()->allow_2pc) { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list); } - } else { - for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { - MemTable* m = *it; - // commit failed. setup state so that we can flush again. - if (m->edit_.GetBlobFileAdditions().empty()) { - ROCKS_LOG_BUFFER(log_buffer, - "Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " failed", - m->file_number_, mem_id); - } else { - ROCKS_LOG_BUFFER(log_buffer, - "Level-0 commit table #%" PRIu64 - " (+%zu blob files)" - ": memtable #%" PRIu64 " failed", - m->file_number_, - m->edit_.GetBlobFileAdditions().size(), mem_id); - } - - m->flush_completed_ = false; - m->flush_in_progress_ = false; - m->edit_.Clear(); - num_flush_not_started_++; - m->file_number_ = 0; - imm_flush_needed.store(true, std::memory_order_release); - ++mem_id; + if (min_wal_number_to_keep > + vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); + edit_list.push_back(wal_deletion.get()); } } + + const auto manifest_write_cb = [this, cfd, batch_count, log_buffer, + to_delete, mu](const Status& status) { + RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer, + to_delete, mu); + }; + + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, + db_directory, /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, + manifest_write_cb); + *io_s = vset->io_status(); } } commit_in_progress_ = false; @@ -570,7 +521,7 @@ void MemTableList::Add(MemTable* m, autovector* to_delete) { InstallNewVersion(); // this method is used to move mutable memtable into an immutable list. 
// since mutable memtable is already refcounted by the DBImpl, - // and when moving to the imutable list we don't unref it, + // and when moving to the immutable list we don't unref it, // we don't have to ref the memtable here. we just take over the // reference from the DBImpl. current_->Add(m, to_delete); @@ -642,21 +593,93 @@ void MemTableList::InstallNewVersion() { } } +void MemTableList::RemoveMemTablesOrRestoreFlags( + const Status& s, ColumnFamilyData* cfd, size_t batch_count, + LogBuffer* log_buffer, autovector* to_delete, + InstrumentedMutex* mu) { + assert(mu); + mu->AssertHeld(); + assert(to_delete); + // we will be changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables have been flushed. + + // commit new state only if the column family is NOT dropped. + // The reason is as follows (refer to + // ColumnFamilyTest.FlushAndDropRaceCondition). + // If the column family is dropped, then according to LogAndApply, its + // corresponding flush operation is NOT written to the MANIFEST. This + // means the DB is not aware of the L0 files generated from the flush. + // By committing the new state, we remove the memtable from the memtable + // list. Creating an iterator on this column family will not be able to + // read full data since the memtable is removed, and the DB is not aware + // of the L0 files, causing MergingIterator unable to build child + // iterators. RocksDB contract requires that the iterator can be created + // on a dropped column family, and we must be able to + // read full data as long as column family handle is not deleted, even if + // the column family is dropped. + if (s.ok() && !cfd->IsDropped()) { // commit new state + while (batch_count-- > 0) { + MemTable* m = current_->memlist_.back(); + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + assert(m->file_number_ > 0); + current_->Remove(m, to_delete); + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); + ++mem_id; + } + } else { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { + MemTable* m = *it; + // commit failed. setup state so that we can flush again. 
+ if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64 + " failed", + m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + m->file_number_ = 0; + imm_flush_needed.store(true, std::memory_order_release); + ++mem_id; + } + } +} + uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( - const autovector& memtables_to_flush) { + const std::unordered_set* memtables_to_flush) { uint64_t min_log = 0; for (auto& m : current_->memlist_) { - // Assume the list is very short, we can live with O(m*n). We can optimize - // if the performance has some problem. - bool should_skip = false; - for (MemTable* m_to_flush : memtables_to_flush) { - if (m == m_to_flush) { - should_skip = true; - break; - } - } - if (should_skip) { + if (memtables_to_flush && memtables_to_flush->count(m)) { continue; } @@ -676,7 +699,8 @@ Status InstallMemtableAtomicFlushResults( const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_metas, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_metas, autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer) { AutoThreadOperationStageUpdater stage_updater( @@ -688,6 +712,10 @@ Status InstallMemtableAtomicFlushResults( if (imm_lists != nullptr) { assert(imm_lists->size() == num); } + if (num == 0) { + return Status::OK(); + } + for (size_t k = 0; k != num; ++k) { #ifndef NDEBUG const auto* imm = @@ -716,12 +744,37 @@ Status InstallMemtableAtomicFlushResults( ++num_entries; edit_lists.emplace_back(edits); } + + WalNumber min_wal_number_to_keep = 0; + if (vset->db_options()->allow_2pc) { + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, cfds, edit_lists, mems_list, prep_tracker); + edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep); + } + + std::unique_ptr wal_deletion; + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (!vset->db_options()->allow_2pc) { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists); + } + if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); + edit_lists.back().push_back(wal_deletion.get()); + ++num_entries; + } + } + // Mark the version edits as an atomic group if the number of version edits // exceeds 1. 
if (cfds.size() > 1) { - for (auto& edits : edit_lists) { - assert(edits.size() == 1); - edits[0]->MarkAtomicGroup(--num_entries); + for (size_t i = 0; i < edit_lists.size(); i++) { + assert((edit_lists[i].size() == 1) || + ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1))); + for (auto& e : edit_lists[i]) { + e->MarkAtomicGroup(--num_entries); + } } assert(0 == num_entries); } diff --git a/db/memtable_list.h b/db/memtable_list.h index 72105d26667..493a54d4034 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -76,7 +76,7 @@ class MemTableListVersion { } void MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob); + ReadCallback* callback); // Returns all the merge operands corresponding to the key by searching all // memtables starting from the most recent one. @@ -138,8 +138,8 @@ class MemTableListVersion { const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); @@ -251,7 +251,7 @@ class MemTableList { // Returns the earliest memtables that needs to be flushed. The returned // memtables are guaranteed to be in the ascending order of created time. - void PickMemtablesToFlush(const uint64_t* max_memtable_id, + void PickMemtablesToFlush(uint64_t max_memtable_id, autovector* mems); // Reset status of the given memtable list back to pending state so that @@ -335,7 +335,7 @@ class MemTableList { // Returns the min log containing the prep section after memtables listsed in // `memtables_to_flush` are flushed and their status is persisted in manifest. 
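// A caller-side sketch of the new PrecomputeMinLogContainingPrepSection()
// signature below (not part of this patch; `imm` is a MemTableList* and
// `mems` is the autovector of memtables picked for flush): passing the
// memtables as a set makes the exclusion check a hash lookup instead of the
// old O(m*n) scan over an autovector.
std::unordered_set<MemTable*> memtables_to_flush(mems.begin(), mems.end());
uint64_t min_log_with_prep =
    imm->PrecomputeMinLogContainingPrepSection(&memtables_to_flush);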
uint64_t PrecomputeMinLogContainingPrepSection( - const autovector& memtables_to_flush); + const std::unordered_set* memtables_to_flush = nullptr); uint64_t GetEarliestMemTableID() const { auto& memlist = current_->memlist_; @@ -381,14 +381,21 @@ class MemTableList { const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); // DB mutex held void InstallNewVersion(); + // DB mutex held + // Called after writing to MANIFEST + void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd, + size_t batch_count, LogBuffer* log_buffer, + autovector* to_delete, + InstrumentedMutex* mu); + const int min_write_buffer_number_to_merge_; MemTableListVersion* current_; @@ -424,7 +431,8 @@ extern Status InstallMemtableAtomicFlushResults( const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_meta, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_meta, autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); } // namespace ROCKSDB_NAMESPACE diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index cc6e566ad57..165471b6bbb 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -30,14 +30,14 @@ class MemTableListTest : public testing::Test { MemTableListTest() : db(nullptr), file_number(1) { dbname = test::PerThreadDBPath("memtable_list_test"); options.create_if_missing = true; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); } // Create a test db if not yet created void CreateDB() { if (db == nullptr) { options.create_if_missing = true; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); // Open DB only with default column family ColumnFamilyOptions cf_options; std::vector cf_descs; @@ -78,7 +78,7 @@ class MemTableListTest : public testing::Test { handles.clear(); delete db; db = nullptr; - DestroyDB(dbname, options, cf_descs); + EXPECT_OK(DestroyDB(dbname, options, cf_descs)); } } @@ -103,7 +103,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, - /*io_tracer=*/nullptr); + /*io_tracer=*/nullptr, /*db_session_id*/ ""); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -153,7 +153,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, - /*io_tracer=*/nullptr); + /*io_tracer=*/nullptr, /*db_session_id*/ ""); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -185,8 +185,9 @@ class MemTableListTest : public testing::Test { InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); return InstallMemtableAtomicFlushResults( - &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex, - file_meta_ptrs, to_delete, nullptr, &log_buffer); + &lists, cfds, 
mutable_cf_options_list, mems_list, &versions, + nullptr /* prep_tracker */, &mutex, file_meta_ptrs, to_delete, nullptr, + &log_buffer); } }; @@ -199,7 +200,7 @@ TEST_F(MemTableListTest, Empty) { ASSERT_FALSE(list.IsFlushPending()); autovector mems; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &mems); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &mems); ASSERT_EQ(0, mems.size()); autovector to_delete; @@ -234,7 +235,7 @@ TEST_F(MemTableListTest, GetTest) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, @@ -242,10 +243,14 @@ TEST_F(MemTableListTest, GetTest) { mem->Ref(); // Write some keys to this memtable. - mem->Add(++seq, kTypeDeletion, "key1", ""); - mem->Add(++seq, kTypeValue, "key2", "value2"); - mem->Add(++seq, kTypeValue, "key1", "value1"); - mem->Add(++seq, kTypeValue, "key2", "value2.2"); + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); @@ -283,8 +288,10 @@ TEST_F(MemTableListTest, GetTest) { kMaxSequenceNumber, 0 /* column_family_id */); mem2->Ref(); - mem2->Add(++seq, kTypeDeletion, "key1", ""); - mem2->Add(++seq, kTypeValue, "key2", "value2.3"); + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3", + nullptr /* kv_prot_info */)); // Add second memtable to list list.Add(mem2, &to_delete); @@ -351,7 +358,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, @@ -359,9 +366,12 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { mem->Ref(); // Write some keys to this memtable. - mem->Add(++seq, kTypeDeletion, "key1", ""); - mem->Add(++seq, kTypeValue, "key2", "value2"); - mem->Add(++seq, kTypeValue, "key2", "value2.2"); + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); @@ -399,7 +409,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Flush this memtable from the list. // (It will then be a part of the memtable history). 
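// A minimal sketch of the PickMemtablesToFlush() call pattern used in the
// tests below (not part of this patch; `list` is a MemTableList): the limit
// is now passed by value, and port::kMaxUint64 takes over the role the old
// nullptr argument played, i.e. "no upper bound on the memtable ID".
autovector<MemTable*> picked;
list.PickMemtablesToFlush(port::kMaxUint64 /* max_memtable_id */, &picked);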
autovector to_flush; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); MutableCFOptions mutable_cf_options(options); @@ -443,15 +453,17 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { kMaxSequenceNumber, 0 /* column_family_id */); mem2->Ref(); - mem2->Add(++seq, kTypeDeletion, "key1", ""); - mem2->Add(++seq, kTypeValue, "key3", "value3"); + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3", + nullptr /* kv_prot_info */)); // Add second memtable to list list.Add(mem2, &to_delete); ASSERT_EQ(0, to_delete.size()); to_flush.clear(); - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); // Flush second memtable @@ -527,7 +539,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); autovector to_delete; @@ -554,11 +566,16 @@ TEST_F(MemTableListTest, FlushPendingTest) { std::string value; MergeContext merge_context; - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr /* kv_prot_info */)); tables.push_back(mem); } @@ -567,7 +584,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); autovector to_flush; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); // Request a flush even though there is nothing to flush @@ -576,7 +593,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); // Attempt to 'flush' to clear request for flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -600,7 +617,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(2, to_flush.size()); ASSERT_EQ(2, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -621,7 +638,7 @@ 
TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(3, to_flush.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -629,7 +646,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Pick tables to flush again autovector to_flush2; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); ASSERT_EQ(0, to_flush2.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -647,7 +664,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush again - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); ASSERT_EQ(1, to_flush2.size()); ASSERT_EQ(4, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -668,7 +685,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); // Should pick 4 of 5 since 1 table has been picked in to_flush2 ASSERT_EQ(4, to_flush.size()); ASSERT_EQ(5, list.NumNotFlushed()); @@ -677,7 +694,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Pick tables to flush again autovector to_flush3; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush3); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush3); ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -738,7 +755,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { autovector to_flush4; list.FlushRequested(); ASSERT_TRUE(list.HasFlushRequested()); - list.PickMemtablesToFlush(&memtable_id, &to_flush4); + list.PickMemtablesToFlush(memtable_id, &to_flush4); ASSERT_TRUE(to_flush4.empty()); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -749,7 +766,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // equal to 5. Therefore, only tables[5] will be selected. 
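The comment above spells out the rule behind the signature change in this file: PickMemtablesToFlush now takes the maximum memtable ID by value, with port::kMaxUint64 standing in for the old nullptr ("no limit"). Purely as an illustration of that selection rule -- ToyMemTable and PickToFlush below are hypothetical names, not the RocksDB implementation:

    #include <cstdint>
    #include <vector>

    struct ToyMemTable {
      uint64_t id;
      bool flush_in_progress = false;
    };

    // Pick every not-yet-flushing memtable whose ID is <= max_memtable_id.
    // Passing the largest uint64_t value (the analogue of port::kMaxUint64)
    // means "no upper bound", i.e. pick everything that is eligible.
    std::vector<ToyMemTable*> PickToFlush(std::vector<ToyMemTable>& imm,
                                          uint64_t max_memtable_id) {
      std::vector<ToyMemTable*> picked;
      for (ToyMemTable& m : imm) {
        if (!m.flush_in_progress && m.id <= max_memtable_id) {
          m.flush_in_progress = true;
          picked.push_back(&m);
        }
      }
      return picked;
    }

Under this rule, PickToFlush(imm, 5) would select only tables with ID at most 5, which is the behaviour the surrounding test asserts.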
memtable_id = 5; list.FlushRequested(); - list.PickMemtablesToFlush(&memtable_id, &to_flush4); + list.PickMemtablesToFlush(memtable_id, &to_flush4); ASSERT_EQ(1, static_cast(to_flush4.size())); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -791,7 +808,7 @@ TEST_F(MemTableListTest, AtomicFlusTest) { auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); @@ -823,11 +840,16 @@ TEST_F(MemTableListTest, AtomicFlusTest) { std::string value; - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr /* kv_prot_info */)); elem.push_back(mem); } @@ -841,7 +863,8 @@ TEST_F(MemTableListTest, AtomicFlusTest) { auto* list = lists[i]; ASSERT_FALSE(list->IsFlushPending()); ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); - list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]); + list->PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, + &flush_candidates[i]); ASSERT_EQ(0, flush_candidates[i].size()); } // Request flush even though there is nothing to flush @@ -871,8 +894,7 @@ TEST_F(MemTableListTest, AtomicFlusTest) { // Pick memtables to flush for (auto i = 0; i != num_cfs; ++i) { flush_candidates[i].clear(); - lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i], - &flush_candidates[i]); + lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]); ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, static_cast(flush_candidates[i].size())); } diff --git a/db/merge_context.h b/db/merge_context.h index e1869a341b0..925bfc0e068 100644 --- a/db/merge_context.h +++ b/db/merge_context.h @@ -68,7 +68,7 @@ class MergeContext { } // Get the operand at the index. - Slice GetOperand(int index) { + Slice GetOperand(int index) const { assert(operand_list_); SetDirectionForward(); @@ -76,13 +76,21 @@ class MergeContext { } // Same as GetOperandsDirectionForward - const std::vector& GetOperands() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. + const std::vector& GetOperands() const { return GetOperandsDirectionForward(); } // Return all the operands in the order as they were merged (passed to // FullMerge or FullMergeV2) - const std::vector& GetOperandsDirectionForward() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. 
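As an aside on the lifetime note just added above: if the merge operands are needed after the MergeContext is used again, they have to be copied out first. A minimal sketch of such a copy (illustrative only; assumes it sits inside the rocksdb namespace with db/merge_context.h, <string> and <vector> available):

    // Not part of the patch: GetOperands() returns a reference that the next
    // call on the same MergeContext may invalidate, so take deep copies when
    // the operands must outlive it.
    std::vector<std::string> CopyOperands(const MergeContext& merge_context) {
      std::vector<std::string> copies;
      for (const Slice& op : merge_context.GetOperands()) {
        copies.emplace_back(op.ToString());  // deep copy of each operand
      }
      return copies;
    }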
+ const std::vector& GetOperandsDirectionForward() const { if (!operand_list_) { return empty_operand_list; } @@ -93,7 +101,11 @@ class MergeContext { // Return all the operands in the reversed order relative to how they were // merged (passed to FullMerge or FullMergeV2) - const std::vector& GetOperandsDirectionBackward() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. + const std::vector& GetOperandsDirectionBackward() const { if (!operand_list_) { return empty_operand_list; } @@ -110,14 +122,14 @@ class MergeContext { } } - void SetDirectionForward() { + void SetDirectionForward() const { if (operands_reversed_ == true) { std::reverse(operand_list_->begin(), operand_list_->end()); operands_reversed_ = false; } } - void SetDirectionBackward() { + void SetDirectionBackward() const { if (operands_reversed_ == false) { std::reverse(operand_list_->begin(), operand_list_->end()); operands_reversed_ = true; @@ -125,10 +137,10 @@ class MergeContext { } // List of operands - std::unique_ptr> operand_list_; + mutable std::unique_ptr> operand_list_; // Copy of operands that are not pinned. std::unique_ptr>> copied_operands_; - bool operands_reversed_ = true; + mutable bool operands_reversed_ = true; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/merge_helper.cc b/db/merge_helper.cc index ebfd22a7dc3..31cd3b6c587 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -14,6 +14,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" #include "table/format.h" #include "table/internal_iterator.h" @@ -28,6 +29,7 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, Statistics* stats, const std::atomic* shutting_down) : env_(env), + clock_(env->GetSystemClock().get()), user_comparator_(user_comparator), user_merge_operator_(user_merge_operator), compaction_filter_(compaction_filter), @@ -39,7 +41,7 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, snapshot_checker_(snapshot_checker), level_(level), keys_(), - filter_timer_(env_), + filter_timer_(clock_), total_filter_time_(0U), stats_(stats) { assert(user_comparator_ != nullptr); @@ -52,7 +54,7 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, const Slice& key, const Slice* value, const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, Env* env, + Statistics* statistics, SystemClock* clock, Slice* result_operand, bool update_num_ops_stats) { assert(merge_operator != nullptr); @@ -75,7 +77,7 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand); { // Setup to time the merge - StopWatchNano timer(env, statistics != nullptr); + StopWatchNano timer(clock, statistics != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); // Do the merge @@ -116,7 +118,8 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, Status MergeHelper::MergeUntil(InternalIterator* iter, CompactionRangeDelAggregator* range_del_agg, const SequenceNumber stop_before, - const bool at_bottom) { + const bool at_bottom, + const bool allow_data_in_errors) { // Get a copy of the internal key, before it's invalidated by iter->Next() // Also maintain the list of merge operands seen. 
assert(HasOperator()); @@ -139,7 +142,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // orig_ikey is backed by keys_.back() if !keys_.empty() ParsedInternalKey orig_ikey; - Status s = ParseInternalKey(original_key, &orig_ikey); + Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors); assert(s.ok()); if (!s.ok()) return s; @@ -153,12 +156,12 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, ParsedInternalKey ikey; assert(keys_.size() == merge_context_.GetNumOperands()); - if (ParseInternalKey(iter->key(), &ikey) != Status::OK()) { + Status pik_status = + ParseInternalKey(iter->key(), &ikey, allow_data_in_errors); + if (!pik_status.ok()) { // stop at corrupted key if (assert_valid_internal_key_) { - assert(!"Corrupted internal key not expected."); - s = Status::Corruption("Corrupted internal key not expected."); - return s; + return pik_status; } break; } else if (first_key) { @@ -212,7 +215,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, env_); + stats_, clock_); // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) @@ -267,9 +270,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (keys_.size() == 1) { // we need to re-anchor the orig_ikey because it was anchored by // original_key before - Status pikStatus = ParseInternalKey(keys_.back(), &orig_ikey); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); + pik_status = + ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); } if (filter == CompactionFilter::Decision::kKeep) { merge_context_.PushOperand( @@ -322,7 +326,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, std::string merge_result; s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, env_); + stats_, clock_); if (s.ok()) { // The original key encountered // We are certain that keys_ is not empty here (see assertions couple of @@ -345,7 +349,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, bool merge_success = false; std::string merge_result; { - StopWatchNano timer(env_, stats_ != nullptr); + StopWatchNano timer(clock_, stats_ != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); merge_success = user_merge_operator_->PartialMergeMulti( orig_ikey.user_key, diff --git a/db/merge_helper.h b/db/merge_helper.h index c0534f08ba4..f3bcd948b44 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -25,6 +25,7 @@ class Iterator; class Logger; class MergeOperator; class Statistics; +class SystemClock; class MergeHelper { public: @@ -48,7 +49,7 @@ class MergeHelper { const Slice& key, const Slice* value, const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, Env* env, + Statistics* statistics, SystemClock* clock, Slice* result_operand = nullptr, bool update_num_ops_stats = false); @@ -66,6 +67,8 @@ class MergeHelper { // 0 means no restriction // at_bottom: (IN) true if the iterator covers the bottem level, which means // we could reach the start of the history of this user key. + // allow_data_in_errors: (IN) if true, data details will be displayed in + // error/log messages. // // Returns one of the following statuses: // - OK: Entries were successfully merged. 
@@ -80,7 +83,8 @@ class MergeHelper { Status MergeUntil(InternalIterator* iter, CompactionRangeDelAggregator* range_del_agg = nullptr, const SequenceNumber stop_before = 0, - const bool at_bottom = false); + const bool at_bottom = false, + const bool allow_data_in_errors = false); // Filters a merge operand using the compaction filter specified // in the constructor. Returns the decision that the filter made. @@ -137,6 +141,7 @@ class MergeHelper { private: Env* env_; + SystemClock* clock_; const Comparator* user_comparator_; const MergeOperator* user_merge_operator_; const CompactionFilter* compaction_filter_; diff --git a/db/merge_test.cc b/db/merge_test.cc index 76716aefab0..2cca0735ef4 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -4,8 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). // #include -#include + #include +#include #include "db/db_impl/db_impl.h" #include "db/dbformat.h" @@ -18,6 +19,7 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" +#include "util/coding.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -97,9 +99,9 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, options.create_if_missing = true; options.merge_operator = std::make_shared(); options.max_successive_merges = max_successive_merges; - Status s; options.env = EnvMergeTest::GetInstance(); - DestroyDB(dbname, Options()); + EXPECT_OK(DestroyDB(dbname, Options())); + Status s; // DBWithTTL is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE if (ttl) { @@ -113,10 +115,8 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, assert(!ttl); s = DB::Open(options, dbname, &db); #endif // !ROCKSDB_LITE - if (!s.ok()) { - std::cerr << s.ToString() << std::endl; - assert(false); - } + EXPECT_OK(s); + assert(s.ok()); return std::shared_ptr(db); } @@ -271,21 +271,25 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { counters.assert_set("a", 1); - if (test_compaction) db->Flush(o); + if (test_compaction) { + ASSERT_OK(db->Flush(o)); + } - assert(counters.assert_get("a") == 1); + ASSERT_EQ(counters.assert_get("a"), 1); counters.assert_remove("b"); // defaut value is 0 if non-existent - assert(counters.assert_get("b") == 0); + ASSERT_EQ(counters.assert_get("b"), 0); counters.assert_add("a", 2); - if (test_compaction) db->Flush(o); + if (test_compaction) { + ASSERT_OK(db->Flush(o)); + } // 1+2 = 3 - assert(counters.assert_get("a")== 3); + ASSERT_EQ(counters.assert_get("a"), 3); dumpDb(db); @@ -295,22 +299,112 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { counters.assert_add("b", i); sum += i; } - assert(counters.assert_get("b") == sum); + ASSERT_EQ(counters.assert_get("b"), sum); dumpDb(db); if (test_compaction) { - db->Flush(o); + ASSERT_OK(db->Flush(o)); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); dumpDb(db); - assert(counters.assert_get("a")== 3); - assert(counters.assert_get("b") == sum); + ASSERT_EQ(counters.assert_get("a"), 3); + ASSERT_EQ(counters.assert_get("b"), sum); } } +void testCountersWithFlushAndCompaction(Counters& counters, DB* db) { + ASSERT_OK(db->Put({}, "1", "1")); + ASSERT_OK(db->Flush(FlushOptions())); + + std::atomic cnt{0}; + const auto get_thread_id = [&cnt]() { + thread_local int thread_id{cnt++}; + return thread_id; + }; + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) { + int thread_id = get_thread_id(); + if (1 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_compact_thread:0"); + } else if (2 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:0"); + } + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void* /*arg*/) { + int thread_id = get_thread_id(); + if (0 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::set_options_thread:0"); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::set_options_thread:1"); + } + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) { + auto* mutex = reinterpret_cast(arg); + mutex->AssertHeld(); + int thread_id = get_thread_id(); + ASSERT_EQ(2, thread_id); + mutex->Unlock(); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:1"); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:2"); + mutex->Lock(); + }); + SyncPoint::GetInstance()->LoadDependency({ + {"testCountersWithFlushAndCompaction::set_options_thread:0", + "testCountersWithCompactionAndFlush:BeforeCompact"}, + {"testCountersWithFlushAndCompaction::bg_compact_thread:0", + "testCountersWithFlushAndCompaction:BeforeIncCounters"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:0", + "testCountersWithFlushAndCompaction::set_options_thread:1"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:1", + "testCountersWithFlushAndCompaction:BeforeVerification"}, + {"testCountersWithFlushAndCompaction:AfterGet", + "testCountersWithFlushAndCompaction::bg_flush_thread:2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread set_options_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->SetOptions( + {{"disable_auto_compactions", "false"}})); + }); + TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact"); + port::Thread compact_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->CompactRange( + CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr)); + }); + + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeIncCounters"); + counters.add("test-key", 1); + + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db->Flush(flush_opts)); + + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeVerification"); + std::string expected; + PutFixed64(&expected, 1); + std::string actual; + Status s = db->Get(ReadOptions(), "test-key", &actual); + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:AfterGet"); + set_options_thread.join(); + compact_thread.join(); + ASSERT_OK(s); + ASSERT_EQ(expected, actual); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + void testSuccessiveMerge(Counters& counters, size_t max_num_merges, size_t num_merges) { counters.assert_remove("z"); @@ -322,14 +416,14 @@ void testSuccessiveMerge(Counters& counters, size_t max_num_merges, sum += i; if (i % (max_num_merges + 1) == 0) { - assert(num_merge_operator_calls == max_num_merges + 1); + ASSERT_EQ(num_merge_operator_calls, max_num_merges + 1); } else { - assert(num_merge_operator_calls == 0); + ASSERT_EQ(num_merge_operator_calls, 0); } resetNumMergeOperatorCalls(); - assert(counters.assert_get("z") == sum); - assert(num_merge_operator_calls == i % (max_num_merges + 1)); + 
ASSERT_EQ(counters.assert_get("z"), sum); + ASSERT_EQ(num_merge_operator_calls, i % (max_num_merges + 1)); } } @@ -346,8 +440,8 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge, counters->assert_add("b", i); tmp_sum += i; } - db->Flush(o); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(tmp_sum, counters->assert_get("b")); if (count > max_merge) { // in this case, FullMerge should be called instead. @@ -360,13 +454,13 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge, // Test case 2: partial merge should not be called when a put is found. resetNumPartialMergeCalls(); tmp_sum = 0; - db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10"); + ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10")); for (size_t i = 1; i <= count; i++) { counters->assert_add("c", i); tmp_sum += i; } - db->Flush(o); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(tmp_sum, counters->assert_get("c")); ASSERT_EQ(num_partial_merge_calls, 0U); ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U); @@ -374,7 +468,7 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge, void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, size_t num_merges) { - assert(num_merges > max_num_merges); + ASSERT_GT(num_merges, max_num_merges); Slice key("BatchSuccessiveMerge"); uint64_t merge_value = 1; @@ -385,15 +479,12 @@ void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, // Create the batch WriteBatch batch; for (size_t i = 0; i < num_merges; ++i) { - batch.Merge(key, merge_value_slice); + ASSERT_OK(batch.Merge(key, merge_value_slice)); } // Apply to memtable and count the number of merges resetNumMergeOperatorCalls(); - { - Status s = db->Write(WriteOptions(), &batch); - assert(s.ok()); - } + ASSERT_OK(db->Write(WriteOptions(), &batch)); ASSERT_EQ( num_merge_operator_calls, static_cast(num_merges - (num_merges % (max_num_merges + 1)))); @@ -401,10 +492,7 @@ void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, // Get the value resetNumMergeOperatorCalls(); std::string get_value_str; - { - Status s = db->Get(ReadOptions(), key, &get_value_str); - assert(s.ok()); - } + ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str)); assert(get_value_str.size() == sizeof(uint64_t)); uint64_t get_value = DecodeFixed64(&get_value_str[0]); ASSERT_EQ(get_value, num_merges * merge_value); @@ -427,7 +515,7 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { } } - DestroyDB(dbname, Options()); + ASSERT_OK(DestroyDB(dbname, Options())); { size_t max_merge = 5; @@ -436,7 +524,8 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { testCounters(counters, db.get(), use_compression); testSuccessiveMerge(counters, max_merge, max_merge * 2); testSingleBatchSuccessiveMerge(db.get(), 5, 7); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } { @@ -447,14 +536,16 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { auto db = OpenDb(dbname, use_ttl, max_merge); MergeBasedCounters counters(db, 0); testPartialMerge(&counters, db.get(), max_merge, min_merge, count); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } { auto db = OpenDb(dbname, use_ttl, max_merge); 
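A pattern running through this test file's changes is that every Status-returning call is now checked instead of silently dropped. For reference, a self-contained sketch of the open/use/close/destroy cycle with explicit status checks; the path and options here are illustrative and not taken from the test:

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    void StatusCheckedLifecycle() {
      rocksdb::Options options;
      options.create_if_missing = true;
      const std::string name = "/tmp/rocksdb_status_demo";  // illustrative path

      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, name, &db);
      assert(s.ok());

      s = db->Put(rocksdb::WriteOptions(), "key", "value");
      assert(s.ok());

      s = db->Close();  // Close() also returns a Status worth checking
      assert(s.ok());
      delete db;

      s = rocksdb::DestroyDB(name, options);
      assert(s.ok());
    }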
MergeBasedCounters counters(db, 0); testPartialMerge(&counters, db.get(), max_merge, min_merge, min_merge * 10); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } } @@ -465,15 +556,15 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { counters.add("test-key", 1); counters.add("test-key", 1); counters.add("test-key", 1); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } DB* reopen_db; ASSERT_OK(DB::Open(Options(), dbname, &reopen_db)); std::string value; - ASSERT_TRUE(!(reopen_db->Get(ReadOptions(), "test-key", &value).ok())); + ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value)); delete reopen_db; - DestroyDB(dbname, Options()); + ASSERT_OK(DestroyDB(dbname, Options())); } /* Temporary remove this test @@ -502,6 +593,19 @@ TEST_F(MergeTest, MergeDbTtlTest) { runTest(test::PerThreadDBPath("merge_testdbttl"), true); // Run test on TTL database } + +TEST_F(MergeTest, MergeWithCompactionAndFlush) { + const std::string dbname = + test::PerThreadDBPath("merge_with_compaction_and_flush"); + { + auto db = OpenDb(dbname); + { + MergeBasedCounters counters(db, 0); + testCountersWithFlushAndCompaction(counters, db.get()); + } + } + ASSERT_OK(DestroyDB(dbname, Options())); +} #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 98d98eae242..ee6b0763997 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -28,10 +28,6 @@ #include "test_util/testutil.h" #include "util/string_util.h" -using std::cerr; -using std::cout; -using std::endl; -using std::flush; namespace ROCKSDB_NAMESPACE { @@ -65,7 +61,7 @@ class ObsoleteFilesTest : public DBTestBase { void CheckFileTypeCounts(const std::string& dir, int required_log, int required_sst, int required_manifest) { std::vector filenames; - env_->GetChildren(dir, &filenames); + ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0; int sst_cnt = 0; @@ -74,7 +70,7 @@ class ObsoleteFilesTest : public DBTestBase { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } @@ -98,6 +94,12 @@ class ObsoleteFilesTest : public DBTestBase { options.WAL_ttl_seconds = 300; // Used to test log files options.WAL_size_limit_MB = 1024; // Used to test log files options.wal_dir = wal_dir_; + + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) in ObsoleteFilesTest.BlobFiles and the + // periodic stat dumping thread. 
+ options.stats_dump_period_sec = 0; + Destroy(options); Reopen(options); } @@ -196,6 +198,8 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { } TEST_F(ObsoleteFilesTest, BlobFiles) { + ReopenDB(); + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); assert(versions); assert(versions->GetColumnFamilySet()); diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 7ad84642779..3ff7e0952b4 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -65,7 +65,7 @@ TEST_F(OptionsFileTest, NumberOfOptionsFiles) { const int kReopenCount = 20; Options opt; opt.create_if_missing = true; - DestroyDB(dbname_, opt); + ASSERT_OK(DestroyDB(dbname_, opt)); std::unordered_set filename_history; DB* db; for (int i = 0; i < kReopenCount; ++i) { diff --git a/db/output_validator.cc b/db/output_validator.cc index 56b8fe59ef8..c36c9281e15 100644 --- a/db/output_validator.cc +++ b/db/output_validator.cc @@ -9,8 +9,8 @@ namespace ROCKSDB_NAMESPACE { Status OutputValidator::Add(const Slice& key, const Slice& value) { if (enable_hash_) { // Generate a rolling 64-bit hash of the key and values - paranoid_hash_ = Hash64(key.data(), key.size(), paranoid_hash_); - paranoid_hash_ = Hash64(value.data(), value.size(), paranoid_hash_); + paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_); + paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_); } if (enable_order_check_) { TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check", diff --git a/db/output_validator.h b/db/output_validator.h index 167b25e0615..ad9000d5e3b 100644 --- a/db/output_validator.h +++ b/db/output_validator.h @@ -17,8 +17,10 @@ namespace ROCKSDB_NAMESPACE { class OutputValidator { public: explicit OutputValidator(const InternalKeyComparator& icmp, - bool enable_order_check, bool enable_hash) + bool enable_order_check, bool enable_hash, + uint64_t precalculated_hash = 0) : icmp_(icmp), + paranoid_hash_(precalculated_hash), enable_order_check_(enable_order_check), enable_hash_(enable_hash) {} @@ -33,9 +35,11 @@ class OutputValidator { return GetHash() == other_validator.GetHash(); } - private: + // Not (yet) intended to be persisted, so subject to change + // without notice between releases. uint64_t GetHash() const { return paranoid_hash_; } + private: const InternalKeyComparator& icmp_; std::string prev_key_; uint64_t paranoid_hash_ = 0; diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 5a714b9b85a..908e684f73e 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
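Earlier in this hunk, OutputValidator::Add switches to NPHash64 and chains the previous digest in as the seed, while the new precalculated_hash constructor argument lets a validator resume from a prior digest. The sketch below shows that chaining pattern with a toy hash; ToyHash64 and ToyValidator are made-up names and the hash itself is not NPHash64:

    #include <cstddef>
    #include <cstdint>
    #include <string>

    // Toy seeded 64-bit hash (FNV-style); stands in for NPHash64, which differs.
    inline uint64_t ToyHash64(const char* data, size_t n, uint64_t seed) {
      uint64_t h = seed ^ 0xcbf29ce484222325ULL;
      for (size_t i = 0; i < n; ++i) {
        h ^= static_cast<unsigned char>(data[i]);
        h *= 0x100000001b3ULL;
      }
      return h;
    }

    // Mirrors the chaining in OutputValidator::Add: each key and value folds
    // the previous digest in as the seed, so the final value covers the whole
    // key/value stream, and a precalculated digest can seed a new instance.
    class ToyValidator {
     public:
      explicit ToyValidator(uint64_t precalculated_hash = 0)
          : hash_(precalculated_hash) {}
      void Add(const std::string& key, const std::string& value) {
        hash_ = ToyHash64(key.data(), key.size(), hash_);
        hash_ = ToyHash64(value.data(), value.size(), hash_);
      }
      uint64_t GetHash() const { return hash_; }
     private:
      uint64_t hash_;
    };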
// +#include "rocksdb/perf_context.h" + #include #include #include @@ -15,8 +17,8 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "test_util/testharness.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -76,12 +78,12 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); } for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { std::string key = "k" + ToString(i); - db->Delete(write_options, key); + ASSERT_OK(db->Delete(write_options, key)); } HistogramImpl hist_get; @@ -91,7 +93,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { std::string value; get_perf_context()->Reset(); - StopWatchNano timer(Env::Default()); + StopWatchNano timer(SystemClock::Default().get()); timer.Start(); auto status = db->Get(read_options, key, &value); auto elapsed_nanos = timer.ElapsedNanos(); @@ -110,16 +112,15 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { std::unique_ptr iter(db->NewIterator(read_options)); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->SeekToFirst(); hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count); auto elapsed_nanos = timer.ElapsedNanos(); if (FLAGS_verbose) { - std::cout << "SeekToFirst uesr key comparison: \n" - << hist_seek_to_first.ToString() - << "ikey skipped: " << get_perf_context()->internal_key_skipped_count - << "\n" + std::cout << "SeekToFirst user key comparison: \n" + << hist_seek_to_first.ToString() << "ikey skipped: " + << get_perf_context()->internal_key_skipped_count << "\n" << "idelete skipped: " << get_perf_context()->internal_delete_skipped_count << "\n" << "elapsed: " << elapsed_nanos << "\n"; @@ -132,7 +133,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { std::string key = "k" + ToString(i); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); auto elapsed_nanos = timer.ElapsedNanos(); hist_seek.Add(get_perf_context()->user_key_comparison_count); @@ -146,7 +147,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { get_perf_context()->Reset(); ASSERT_TRUE(iter->Valid()); - StopWatchNano timer2(Env::Default(), true); + StopWatchNano timer2(SystemClock::Default().get(), true); iter->Next(); auto elapsed_nanos2 = timer2.ElapsedNanos(); if (FLAGS_verbose) { @@ -156,7 +157,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { } if (FLAGS_verbose) { - std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString(); + std::cout << "Seek user key comparison: \n" << hist_seek.ToString(); } } @@ -165,7 +166,7 @@ TEST_F(PerfContextTest, StopWatchNanoOverhead) { const int kTotalIterations = 1000000; std::vector timings(kTotalIterations); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); for (auto& timing : timings) { timing = timer.ElapsedNanos(true /* reset */); } @@ -186,7 +187,7 @@ TEST_F(PerfContextTest, StopWatchOverhead) { uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(Env::Default(), nullptr, 0, &elapsed); + StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed); for (auto& timing : timings) { timing = elapsed; } @@ -270,7 +271,7 @@ void ProfileQueries(bool 
enabled_time = false) { std::vector values; get_perf_context()->Reset(); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); if (++num_mutex_waited > 3) { #ifndef NDEBUG ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); @@ -314,7 +315,10 @@ void ProfileQueries(bool enabled_time = false) { hist_get.Add(get_perf_context()->user_key_comparison_count); get_perf_context()->Reset(); - db->MultiGet(read_options, multiget_keys, &values); + auto statuses = db->MultiGet(read_options, multiget_keys, &values); + for (const auto& s : statuses) { + ASSERT_OK(s); + } hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); @@ -324,9 +328,10 @@ void ProfileQueries(bool enabled_time = false) { } if (FLAGS_verbose) { - std::cout << "Put uesr key comparison: \n" << hist_put.ToString() - << "Get uesr key comparison: \n" << hist_get.ToString() - << "MultiGet uesr key comparison: \n" << hist_get.ToString(); + std::cout << "Put user key comparison: \n" + << hist_put.ToString() << "Get user key comparison: \n" + << hist_get.ToString() << "MultiGet user key comparison: \n" + << hist_get.ToString(); std::cout << "Put(): Pre and Post Process Time: \n" << hist_write_pre_post.ToString() << " Writing WAL time: \n" << hist_write_wal_time.ToString() << "\n" @@ -428,7 +433,10 @@ void ProfileQueries(bool enabled_time = false) { hist_get.Add(get_perf_context()->user_key_comparison_count); get_perf_context()->Reset(); - db->MultiGet(read_options, multiget_keys, &values); + auto statuses = db->MultiGet(read_options, multiget_keys, &values); + for (const auto& s : statuses) { + ASSERT_OK(s); + } hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); @@ -438,8 +446,9 @@ void ProfileQueries(bool enabled_time = false) { } if (FLAGS_verbose) { - std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString() - << "ReadOnly MultiGet uesr key comparison: \n" + std::cout << "ReadOnly Get user key comparison: \n" + << hist_get.ToString() + << "ReadOnly MultiGet user key comparison: \n" << hist_mget.ToString(); std::cout << "ReadOnly Get(): Time to get snapshot: \n" @@ -532,14 +541,14 @@ TEST_F(PerfContextTest, SeekKeyComparison) { HistogramImpl hist_time_diff; SetPerfLevel(kEnableTime); - StopWatchNano timer(Env::Default()); + StopWatchNano timer(SystemClock::Default().get()); for (const int i : keys) { std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); get_perf_context()->Reset(); timer.Start(); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); hist_wal_time.Add(get_perf_context()->write_wal_time); @@ -573,7 +582,7 @@ TEST_F(PerfContextTest, SeekKeyComparison) { iter->Next(); hist_next.Add(get_perf_context()->user_key_comparison_count); } - + ASSERT_OK(iter->status()); if (FLAGS_verbose) { std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n" << hist_next.ToString(); @@ -585,25 +594,26 @@ TEST_F(PerfContextTest, DBMutexLockCounter) { for (PerfLevel perf_level_test : {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) { for (int c = 0; c < 2; ++c) { - InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); - mutex.Lock(); - 
ROCKSDB_NAMESPACE::port::Thread child_thread([&] { - SetPerfLevel(perf_level_test); - get_perf_context()->Reset(); - ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); + InstrumentedMutex mutex(nullptr, SystemClock::Default().get(), + stats_code[c]); mutex.Lock(); - mutex.Unlock(); - if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || - stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ROCKSDB_NAMESPACE::port::Thread child_thread([&] { + SetPerfLevel(perf_level_test); + get_perf_context()->Reset(); ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); - } else { - // increment the counter only when it's a DB Mutex - ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0); - } - }); - Env::Default()->SleepForMicroseconds(100); - mutex.Unlock(); - child_thread.join(); + mutex.Lock(); + mutex.Unlock(); + if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || + stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); + } else { + // increment the counter only when it's a DB Mutex + ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0); + } + }); + SystemClock::Default()->SleepForMicroseconds(100); + mutex.Unlock(); + child_thread.join(); } } } @@ -612,7 +622,8 @@ TEST_F(PerfContextTest, FalseDBMutexWait) { SetPerfLevel(kEnableTime); int stats_code[] = {0, static_cast(DB_MUTEX_WAIT_MICROS)}; for (int c = 0; c < 2; ++c) { - InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); + InstrumentedMutex mutex(nullptr, SystemClock::Default().get(), + stats_code[c]); InstrumentedCondVar lock(&mutex); get_perf_context()->Reset(); mutex.Lock(); @@ -817,10 +828,8 @@ TEST_F(PerfContextTest, PerfContextByLevelGetSet) { } TEST_F(PerfContextTest, CPUTimer) { - if (Env::Default()->NowCPUNanos() == 0) { - // TODO: This should be a GTEST_SKIP when the embedded gtest is updated - // to 1.10 or higher. - GTEST_SUCCESS_("Skipped on target without NowCPUNanos support"); + if (SystemClock::Default()->CPUNanos() == 0) { + ROCKSDB_GTEST_SKIP("Target without CPUNanos support"); return; } @@ -837,7 +846,7 @@ TEST_F(PerfContextTest, CPUTimer) { std::string value = "v" + i_str; max_str = max_str > i_str ? max_str : i_str; - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); } std::string last_key = "k" + max_str; std::string last_value = "v" + max_str; diff --git a/db/periodic_work_scheduler.cc b/db/periodic_work_scheduler.cc index 121439011ed..677eec90cdd 100644 --- a/db/periodic_work_scheduler.cc +++ b/db/periodic_work_scheduler.cc @@ -6,13 +6,14 @@ #include "db/periodic_work_scheduler.h" #include "db/db_impl/db_impl.h" -#include "util/cast_util.h" +#include "rocksdb/system_clock.h" #ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { -PeriodicWorkScheduler::PeriodicWorkScheduler(Env* env) : timer_mu_(env) { - timer = std::unique_ptr(new Timer(env)); +PeriodicWorkScheduler::PeriodicWorkScheduler( + const std::shared_ptr& clock) { + timer = std::unique_ptr(new Timer(clock.get())); } void PeriodicWorkScheduler::Register(DBImpl* dbi, @@ -53,10 +54,10 @@ void PeriodicWorkScheduler::Unregister(DBImpl* dbi) { } PeriodicWorkScheduler* PeriodicWorkScheduler::Default() { - // Always use the default Env for the scheduler, as we only use the NowMicros - // which is the same for all env. - // The Env could only be overridden in test. - static PeriodicWorkScheduler scheduler(Env::Default()); + // Always use the default SystemClock for the scheduler, as we only use the + // NowMicros which is the same for all clocks. 
The Env could only be + // overridden in test. + static PeriodicWorkScheduler scheduler(SystemClock::Default()); return &scheduler; } @@ -70,12 +71,13 @@ std::string PeriodicWorkScheduler::GetTaskName(DBImpl* dbi, #ifndef NDEBUG -// Get the static scheduler. For a new env, it needs to re-create the internal -// timer, so only re-create it when there's no running task. Otherwise, return -// the existing scheduler. Which means if the unittest needs to update MockEnv, -// Close all db instances and then re-open them. -PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default(Env* env) { - static PeriodicWorkTestScheduler scheduler(env); +// Get the static scheduler. For a new SystemClock, it needs to re-create the +// internal timer, so only re-create it when there's no running task. Otherwise, +// return the existing scheduler. Which means if the unittest needs to update +// MockClock, Close all db instances and then re-open them. +PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default( + const std::shared_ptr& clock) { + static PeriodicWorkTestScheduler scheduler(clock); static port::Mutex mutex; { MutexLock l(&mutex); @@ -85,7 +87,7 @@ PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default(Env* env) { MutexLock timer_mu_guard(&scheduler.timer_mu_); scheduler.timer->Shutdown(); } - scheduler.timer.reset(new Timer(env)); + scheduler.timer.reset(new Timer(clock.get())); } } return &scheduler; @@ -105,8 +107,9 @@ size_t PeriodicWorkTestScheduler::TEST_GetValidTaskNum() const { return 0; } -PeriodicWorkTestScheduler::PeriodicWorkTestScheduler(Env* env) - : PeriodicWorkScheduler(env) {} +PeriodicWorkTestScheduler::PeriodicWorkTestScheduler( + const std::shared_ptr& clock) + : PeriodicWorkScheduler(clock) {} #endif // !NDEBUG } // namespace ROCKSDB_NAMESPACE diff --git a/db/periodic_work_scheduler.h b/db/periodic_work_scheduler.h index 9382adc449b..fe89ff567f7 100644 --- a/db/periodic_work_scheduler.h +++ b/db/periodic_work_scheduler.h @@ -11,6 +11,7 @@ #include "util/timer.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; // PeriodicWorkScheduler is a singleton object, which is scheduling/running // DumpStats(), PersistStats(), and FlushInfoLog() for all DB instances. All DB @@ -49,25 +50,26 @@ class PeriodicWorkScheduler { // the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically. port::Mutex timer_mu_; - explicit PeriodicWorkScheduler(Env* env); + explicit PeriodicWorkScheduler(const std::shared_ptr& clock); private: std::string GetTaskName(DBImpl* dbi, const std::string& func_name); }; #ifndef NDEBUG -// PeriodicWorkTestScheduler is for unittest, which can specify the Env like -// SafeMockTimeEnv. It also contains functions for unittest. +// PeriodicWorkTestScheduler is for unittest, which can specify the SystemClock +// It also contains functions for unittest. 
class PeriodicWorkTestScheduler : public PeriodicWorkScheduler { public: - static PeriodicWorkTestScheduler* Default(Env* env); + static PeriodicWorkTestScheduler* Default( + const std::shared_ptr& clock); void TEST_WaitForRun(std::function callback) const; size_t TEST_GetValidTaskNum() const; private: - explicit PeriodicWorkTestScheduler(Env* env); + explicit PeriodicWorkTestScheduler(const std::shared_ptr& clock); }; #endif // !NDEBUG diff --git a/db/periodic_work_scheduler_test.cc b/db/periodic_work_scheduler_test.cc index d53265389dd..a92b8730ff8 100644 --- a/db/periodic_work_scheduler_test.cc +++ b/db/periodic_work_scheduler_test.cc @@ -6,6 +6,8 @@ #include "db/periodic_work_scheduler.h" #include "db/db_test_util.h" +#include "env/composite_env_wrapper.h" +#include "test_util/mock_time_env.h" namespace ROCKSDB_NAMESPACE { @@ -13,20 +15,23 @@ namespace ROCKSDB_NAMESPACE { class PeriodicWorkSchedulerTest : public DBTestBase { public: PeriodicWorkSchedulerTest() - : DBTestBase("/periodic_work_scheduler_test", /*env_do_fsync=*/true), - mock_env_(new MockTimeEnv(Env::Default())) {} + : DBTestBase("/periodic_work_scheduler_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } protected: - std::unique_ptr mock_env_; + std::unique_ptr mock_env_; + std::shared_ptr mock_clock_; void SetUp() override { - mock_env_->InstallTimedWaitFixCallback(); + mock_clock_->InstallTimedWaitFixCallback(); SyncPoint::GetInstance()->SetCallBack( "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { auto* periodic_work_scheduler_ptr = reinterpret_cast(arg); *periodic_work_scheduler_ptr = - PeriodicWorkTestScheduler::Default(mock_env_.get()); + PeriodicWorkTestScheduler::Default(mock_clock_); }); } }; @@ -62,7 +67,7 @@ TEST_F(PeriodicWorkSchedulerTest, Basic) { ASSERT_GT(kPeriodSec, 1u); dbfull()->TEST_WaitForStatsDumpRun([&] { - mock_env_->MockSleepForSeconds(static_cast(kPeriodSec) - 1); + mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec) - 1); }); auto scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); @@ -74,14 +79,14 @@ TEST_F(PeriodicWorkSchedulerTest, Basic) { ASSERT_EQ(1, flush_info_log_counter); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(2, dump_st_counter); ASSERT_EQ(2, pst_st_counter); ASSERT_EQ(2, flush_info_log_counter); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(3, dump_st_counter); ASSERT_EQ(3, pst_st_counter); @@ -95,7 +100,7 @@ TEST_F(PeriodicWorkSchedulerTest, Basic) { // Info log flush should still run. 
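The scheduler changes above replace the Env dependency with a SystemClock so that unit tests can drive time through a mock. The general shape of that injection, sketched with hypothetical Clock/MockClock/Scheduler classes (this is not the RocksDB API):

    #include <chrono>
    #include <cstdint>
    #include <memory>
    #include <utility>

    class Clock {
     public:
      virtual ~Clock() = default;
      virtual uint64_t NowMicros() = 0;
    };

    class RealClock : public Clock {
     public:
      uint64_t NowMicros() override {
        using namespace std::chrono;
        return static_cast<uint64_t>(
            duration_cast<microseconds>(steady_clock::now().time_since_epoch())
                .count());
      }
    };

    class MockClock : public Clock {
     public:
      uint64_t NowMicros() override { return now_micros_; }
      void MockSleepForSeconds(uint64_t s) { now_micros_ += s * 1000000; }
     private:
      uint64_t now_micros_ = 0;
    };

    class Scheduler {
     public:
      explicit Scheduler(std::shared_ptr<Clock> clock)
          : clock_(std::move(clock)) {}
      uint64_t Now() { return clock_->NowMicros(); }
     private:
      std::shared_ptr<Clock> clock_;
    };

A test hands the scheduler a MockClock and advances virtual time explicitly, which is what the MockSystemClock-based tests in this hunk do via MockSleepForSeconds.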
dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(3, dump_st_counter); ASSERT_EQ(3, pst_st_counter); ASSERT_EQ(4, flush_info_log_counter); @@ -113,7 +118,7 @@ TEST_F(PeriodicWorkSchedulerTest, Basic) { ASSERT_EQ(2, scheduler->TEST_GetValidTaskNum()); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(4, dump_st_counter); ASSERT_EQ(3, pst_st_counter); ASSERT_EQ(5, flush_info_log_counter); @@ -153,19 +158,19 @@ TEST_F(PeriodicWorkSchedulerTest, MultiInstances) { int expected_run = kInstanceNum; dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); expected_run += kInstanceNum; dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); expected_run += kInstanceNum; dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); @@ -177,14 +182,14 @@ TEST_F(PeriodicWorkSchedulerTest, MultiInstances) { expected_run += (kInstanceNum - half) * 2; dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); for (int i = half; i < kInstanceNum; i++) { - dbs[i]->Close(); + ASSERT_OK(dbs[i]->Close()); delete dbs[i]; } } @@ -201,7 +206,8 @@ TEST_F(PeriodicWorkSchedulerTest, MultiEnv) { Reopen(options1); - std::unique_ptr mock_env2(new MockTimeEnv(Env::Default())); + std::unique_ptr mock_env2( + new CompositeEnvWrapper(Env::Default(), mock_clock_)); Options options2; options2.stats_dump_period_sec = kDumpPeriodSec; options2.stats_persist_period_sec = kPersistPeriodSec; @@ -216,7 +222,7 @@ TEST_F(PeriodicWorkSchedulerTest, MultiEnv) { ASSERT_EQ(dbi->TEST_GetPeriodicWorkScheduler(), dbfull()->TEST_GetPeriodicWorkScheduler()); - db->Close(); + ASSERT_OK(db->Close()); delete db; Close(); } diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index dd428da9bb2..7f5023127a7 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -16,7 +16,6 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -39,7 +38,6 @@ #include "util/string_util.h" #include "utilities/merge_operators.h" -using std::unique_ptr; namespace ROCKSDB_NAMESPACE { class PlainTableKeyDecoderTest : public testing::Test {}; @@ -51,9 +49,9 @@ TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) { Slice contents(tmp); test::StringSource* string_source = new test::StringSource(contents, 0, false); - + std::unique_ptr holder(string_source); std::unique_ptr file_reader( - 
test::GetRandomAccessFileReader(string_source)); + new RandomAccessFileReader(std::move(holder), "test")); std::unique_ptr file_info( new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(), kLength)); @@ -264,18 +262,15 @@ extern const uint64_t kPlainTableMagicNumber; class TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& env_options, - const InternalKeyComparator& icomparator, - EncodingType encoding_type, uint64_t file_size, - int bloom_bits_per_key, double hash_table_ratio, - size_t index_sparseness, - const TableProperties* table_properties, - std::unique_ptr&& file, - const ImmutableCFOptions& ioptions, - const SliceTransform* prefix_extractor, - bool* expect_bloom_not_match, bool store_index_in_file, - uint32_t column_family_id, - const std::string& column_family_name) + TestPlainTableReader( + const EnvOptions& env_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + const TableProperties* table_properties, + std::unique_ptr&& file, + const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, + bool* expect_bloom_not_match, bool store_index_in_file, + uint32_t column_family_id, const std::string& column_family_name) : PlainTableReader(ioptions, std::move(file), env_options, icomparator, encoding_type, file_size, table_properties, prefix_extractor), @@ -397,7 +392,7 @@ class TestPlainTableFactory : public PlainTableFactory { TEST_P(PlainTableDBTest, BadOptions1) { // Build with a prefix extractor ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Bad attempt to re-open without a prefix extractor Options options = CurrentOptions(); @@ -428,7 +423,9 @@ TEST_P(PlainTableDBTest, BadOptions2) { // Build without a prefix extractor // (apparently works even if hash_table_ratio > 0) ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + // Build without a prefix extractor, this call will fail and returns the + // status for this bad attempt. 
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable()); // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor Status s = TryReopen(&options); @@ -503,14 +500,15 @@ TEST_P(PlainTableDBTest, Flush) { ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-table-readers-mem", &int_num)); ASSERT_GT(int_num, 0U); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK( + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); auto row = ptc.begin(); auto tp = row->second; @@ -595,23 +593,23 @@ TEST_P(PlainTableDBTest, Flush2) { DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(Put("1000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v2", Get("1000000000000foo")); ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v3", Get("0000000000000eee")); ASSERT_OK(Delete("0000000000000bar")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); ASSERT_OK(Put("0000000000000eee", "v5")); ASSERT_OK(Put("9000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v5", Get("0000000000000eee")); // Test Bloom Filter @@ -651,7 +649,7 @@ TEST_P(PlainTableDBTest, Immortal) { DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); int copied = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -729,7 +727,7 @@ TEST_P(PlainTableDBTest, Iterator) { ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); Iterator* iter = dbfull()->NewIterator(ReadOptions()); @@ -799,7 +797,7 @@ TEST_P(PlainTableDBTest, Iterator) { expect_bloom_not_match = false; } } - + ASSERT_OK(iter->status()); delete iter; } } @@ -840,7 +838,7 @@ TEST_P(PlainTableDBTest, BloomSchema) { for (unsigned i = 0; i < 2345; ++i) { ASSERT_OK(Put(NthKey(i, 'y'), "added")); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("added", Get(NthKey(42, 'y'))); for (unsigned i = 0; i < 32; ++i) { @@ -898,7 +896,7 @@ TEST_P(PlainTableDBTest, IteratorLargeKeys) { ASSERT_OK(Put(key_list[i], ToString(i))); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); @@ -946,7 +944,7 @@ TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) { ASSERT_OK(Put(key_list[i], ToString(i))); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); @@ -981,7 +979,7 @@ TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) { 
ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); Iterator* iter = dbfull()->NewIterator(ReadOptions()); @@ -1059,7 +1057,7 @@ TEST_P(PlainTableDBTest, HashBucketConflict) { ASSERT_OK(Put("2000000000000fo2", "v")); ASSERT_OK(Put("2000000000000fo3", "v")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1120,6 +1118,7 @@ TEST_P(PlainTableDBTest, HashBucketConflict) { iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1153,7 +1152,7 @@ TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { ASSERT_OK(Put("2000000000000fo2", "v")); ASSERT_OK(Put("2000000000000fo3", "v")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1213,6 +1212,7 @@ TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1235,7 +1235,7 @@ TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { ASSERT_OK(Put("5000000000000fo1", "v2")); ASSERT_OK(Put("5000000000000fo2", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1259,6 +1259,7 @@ TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } @@ -1286,7 +1287,7 @@ TEST_P(PlainTableDBTest, CompactionTrigger) { ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Put(Key(999), "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); } @@ -1297,7 +1298,7 @@ TEST_P(PlainTableDBTest, CompactionTrigger) { ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Put(Key(999), "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 1); @@ -1313,7 +1314,7 @@ TEST_P(PlainTableDBTest, AdaptiveTable) { ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); options.create_if_missing = false; std::shared_ptr block_based_factory( @@ -1329,7 +1330,7 @@ TEST_P(PlainTableDBTest, AdaptiveTable) { ASSERT_OK(Put("2000000000000foo", "v4")); ASSERT_OK(Put("3000000000000bar", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v4", Get("2000000000000foo")); ASSERT_EQ("v5", Get("3000000000000bar")); diff --git a/db/prefix_test.cc b/db/prefix_test.cc index d1ec6a2c8a3..37673eb8c8a 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -25,6 +25,7 @@ int main() { #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "test_util/testharness.h" #include "util/cast_util.h" @@ -311,7 +312,7 @@ TEST(SamePrefixTest, 
InDomainTest) { ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006")); ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011")); ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk")); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); std::string result; auto db_iter = db->NewIterator(ReadOptions()); @@ -331,7 +332,7 @@ TEST(SamePrefixTest, InDomainTest) { ASSERT_OK(db->Put(write_options, "pikachu", "1")); ASSERT_OK(db->Put(write_options, "Meowth", "1")); ASSERT_OK(db->Put(write_options, "Mewtwo", "idk")); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); std::string result; auto db_iter = db->NewIterator(ReadOptions()); @@ -351,7 +352,7 @@ TEST_F(PrefixTest, TestResult) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << " number of buckets: " << num_buckets << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -528,7 +529,7 @@ TEST_F(PrefixTest, PrefixValid) { while (NextOptions(num_buckets)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << " number of buckets: " << num_buckets << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -543,7 +544,7 @@ TEST_F(PrefixTest, PrefixValid) { PutKey(db.get(), write_options, 12345, 8, v18); PutKey(db.get(), write_options, 12345, 9, v19); PutKey(db.get(), write_options, 12346, 8, v16); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); TestKey test_key(12346, 8); std::string s; ASSERT_OK(db->Delete(write_options, TestKeyToSlice(s, test_key))); @@ -581,7 +582,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { while (NextOptions(FLAGS_bucket_count)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -597,7 +598,6 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { HistogramImpl hist_put_time; HistogramImpl hist_put_comparison; - // insert x random prefix, each with y continuous element. 
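The timing hunks just below replace Env::Default() with SystemClock::Default().get() in StopWatchNano. A minimal sketch of that pattern, assuming only the constructor and ElapsedNanos() signatures visible in this patch; the TimeOnce helper is illustrative, not part of the change:

```cpp
#include <cstdint>
#include <functional>

#include "rocksdb/system_clock.h"
#include "util/stop_watch.h"

namespace ROCKSDB_NAMESPACE {

// Times a single unit of work against the default SystemClock.
uint64_t TimeOnce(const std::function<void()>& work) {
  StopWatchNano timer(SystemClock::Default().get(), true /* auto_start */);
  work();
  return timer.ElapsedNanos();
}

}  // namespace ROCKSDB_NAMESPACE
```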
for (auto prefix : prefixes) { for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { @@ -608,7 +608,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { std::string value(FLAGS_value_size, 0); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); ASSERT_OK(db->Put(write_options, key, value)); hist_put_time.Add(timer.ElapsedNanos()); hist_put_comparison.Add(get_perf_context()->user_key_comparison_count); @@ -631,7 +631,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { std::string value = "v" + ToString(0); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); auto key_prefix = options.prefix_extractor->Transform(key); uint64_t total_keys = 0; for (iter->Seek(key); @@ -665,7 +665,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { Slice key = TestKeyToSlice(s, test_key); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); hist_no_seek_time.Add(timer.ElapsedNanos()); hist_no_seek_comparison.Add(get_perf_context()->user_key_comparison_count); @@ -689,7 +689,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) { for (size_t m = 1; m < 100; m++) { std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: " << options.memtable_factory->Name() << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -714,7 +714,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) { } } if (i < 2) { - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); } } diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 20616c22e58..47599a18fa3 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -33,9 +33,10 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator( if (smallest != nullptr) { pinned_bounds_.emplace_back(); auto& parsed_smallest = pinned_bounds_.back(); - Status pikStatus = ParseInternalKey(smallest->Encode(), &parsed_smallest); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); + Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); smallest_ = &parsed_smallest; } @@ -43,9 +44,10 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator( pinned_bounds_.emplace_back(); auto& parsed_largest = pinned_bounds_.back(); - Status pikStatus = ParseInternalKey(largest->Encode(), &parsed_largest); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); + Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); if (parsed_largest.type == kTypeRangeDeletion && parsed_largest.sequence == kMaxSequenceNumber) { diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index d5d79d5a81a..8bbee50fb9d 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -43,12 +43,12 @@ class TruncatedRangeDelIterator { void InternalNext(); - // Seeks to the tombstone with the highest viisble sequence number that covers + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the earliest tombstone that ends after target. 
void Seek(const Slice& target); - // Seeks to the tombstone with the highest viisble sequence number that covers + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the latest tombstone that starts before target. void SeekForPrev(const Slice& target); @@ -284,9 +284,10 @@ class RangeDelAggregator { bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { ParsedInternalKey parsed; - Status pikStatus = ParseInternalKey(key, &parsed); - assert(pikStatus.ok()); - if (!pikStatus.ok()) { + Status pik_status = + ParseInternalKey(key, &parsed, false /* log_err_key */); // TODO + assert(pik_status.ok()); + if (!pik_status.ok()) { return false; } diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 3f3135f2e83..061232f9926 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -11,8 +11,8 @@ int main() { } #else -#include #include +#include #include #include #include @@ -22,14 +22,13 @@ int main() { #include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" -#include "util/gflags_compat.h" - using GFLAGS_NAMESPACE::ParseCommandLineFlags; DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created"); @@ -172,6 +171,8 @@ int main(int argc, char** argv) { ParseCommandLineFlags(&argc, &argv, true); Stats stats; + ROCKSDB_NAMESPACE::SystemClock* clock = + ROCKSDB_NAMESPACE::SystemClock::Default().get(); ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed); std::default_random_engine random_gen(FLAGS_seed); std::normal_distribution normal_dist(FLAGS_tombstone_width_mean, @@ -220,7 +221,7 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::kMaxSequenceNumber)); ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones( - ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */); + clock, true /* auto_start */); range_del_agg.AddTombstones(std::move(fragmented_range_del_iter)); stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); } @@ -237,7 +238,7 @@ int main(int argc, char** argv) { parsed_key.user_key = key_string; ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete( - ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */); + clock, true /* auto_start */); range_del_agg.ShouldDelete(parsed_key, mode); uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index 58426248c3e..0985fee0be3 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -6,12 +6,11 @@ #include "db/range_tombstone_fragmenter.h" #include +#include +#include #include #include -#include -#include - #include "util/autovector.h" #include "util/kv_map.h" #include "util/vector_iterator.h" @@ -26,12 +25,15 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( return; } bool is_sorted = true; - int num_tombstones = 0; InternalKey pinned_last_start_key; Slice last_start_key; + num_unfragmented_tombstones_ = 0; + total_tombstone_payload_bytes_ = 0; for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); - unfragmented_tombstones->Next(), num_tombstones++) { - if (num_tombstones > 0 && + unfragmented_tombstones->Next(), 
num_unfragmented_tombstones_++) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); + if (num_unfragmented_tombstones_ > 0 && icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) { is_sorted = false; break; @@ -51,10 +53,14 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( // Sort the tombstones before fragmenting them. std::vector keys, values; - keys.reserve(num_tombstones); - values.reserve(num_tombstones); + keys.reserve(num_unfragmented_tombstones_); + values.reserve(num_unfragmented_tombstones_); + // Reset the counter to zero for the next iteration over keys. + total_tombstone_payload_bytes_ = 0; for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); unfragmented_tombstones->Next()) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); keys.emplace_back(unfragmented_tombstones->key().data(), unfragmented_tombstones->key().size()); values.emplace_back(unfragmented_tombstones->value().data(), diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h index 63ec24e64f0..f323db5d753 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -68,6 +68,14 @@ struct FragmentedRangeTombstoneList { // number in [lower, upper]. bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const; + uint64_t num_unfragmented_tombstones() const { + return num_unfragmented_tombstones_; + } + + uint64_t total_tombstone_payload_bytes() const { + return total_tombstone_payload_bytes_; + } + private: // Given an ordered range tombstone iterator unfragmented_tombstones, // "fragment" the tombstones into non-overlapping pieces, and store them in @@ -82,6 +90,8 @@ struct FragmentedRangeTombstoneList { std::set seq_set_; std::list pinned_slices_; PinnedIteratorsManager pinned_iters_mgr_; + uint64_t num_unfragmented_tombstones_; + uint64_t total_tombstone_payload_bytes_; }; // FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del @@ -180,6 +190,13 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { SequenceNumber upper_bound() const { return upper_bound_; } SequenceNumber lower_bound() const { return lower_bound_; } + uint64_t num_unfragmented_tombstones() const { + return tombstones_->num_unfragmented_tombstones(); + } + uint64_t total_tombstone_payload_bytes() const { + return tombstones_->total_tombstone_payload_bytes(); + } + private: using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack; diff --git a/db/repair.cc b/db/repair.cc index 671c105a1dc..1ebd47402bd 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -71,7 +71,6 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "db/write_batch_internal.h" -#include "env/composite_env_wrapper.h" #include "file/filename.h" #include "file/writable_file_writer.h" #include "options/cf_options.h" @@ -101,8 +100,8 @@ class Repairer { icmp_(default_cf_opts.comparator), default_cf_opts_( SanitizeOptions(immutable_db_options_, default_cf_opts)), - default_cf_iopts_( - ImmutableCFOptions(immutable_db_options_, default_cf_opts_)), + default_iopts_( + ImmutableOptions(immutable_db_options_, default_cf_opts_)), unknown_cf_opts_( SanitizeOptions(immutable_db_options_, unknown_cf_opts)), create_unknown_cfs_(create_unknown_cfs), @@ -110,14 +109,20 @@ class Repairer { // TableCache can be small since we expect each table to be opened // once. 
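The fragmenter above now counts unfragmented tombstones and their payload bytes and exposes them through FragmentedRangeTombstoneIterator. A small sketch of reading those counters, assuming only the two accessors added here; GetTombstoneStats is illustrative:

```cpp
#include <cstdint>

#include "db/range_tombstone_fragmenter.h"

namespace ROCKSDB_NAMESPACE {

// Reads back the pre-fragmentation tombstone count and payload size that the
// list records while it is being built.
void GetTombstoneStats(const FragmentedRangeTombstoneIterator& iter,
                       uint64_t* num_tombstones, uint64_t* payload_bytes) {
  *num_tombstones = iter.num_unfragmented_tombstones();
  *payload_bytes = iter.total_tombstone_payload_bytes();
}

}  // namespace ROCKSDB_NAMESPACE
```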
NewLRUCache(10, db_options_.table_cache_numshardbits)), - table_cache_(new TableCache( - default_cf_iopts_, env_options_, raw_table_cache_.get(), - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)), + table_cache_( + // TODO: db_session_id for TableCache should be initialized after + // db_session_id_ is set. + new TableCache(default_iopts_, env_options_, raw_table_cache_.get(), + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), + // TODO: db_session_id for VersionSet should be initialized after + // db_session_id_ is set and use it for initialization. vset_(dbname_, &immutable_db_options_, env_options_, raw_table_cache_.get(), &wb_, &wc_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr), + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ ""), next_file_number_(1), db_lock_(nullptr), closed_(false) { @@ -249,7 +254,7 @@ class Repairer { const ImmutableDBOptions immutable_db_options_; const InternalKeyComparator icmp_; const ColumnFamilyOptions default_cf_opts_; - const ImmutableCFOptions default_cf_iopts_; // table_cache_ holds reference + const ImmutableOptions default_iopts_; // table_cache_ holds reference const ColumnFamilyOptions unknown_cf_opts_; const bool create_unknown_cfs_; std::shared_ptr raw_table_cache_; @@ -312,7 +317,7 @@ class Repairer { if (number + 1 > next_file_number_) { next_file_number_ = number + 1; } - if (type == kLogFile) { + if (type == kWalFile) { logs_.push_back(number); } else if (type == kTableFile) { table_fds_.emplace_back(number, static_cast(path_id), @@ -358,14 +363,14 @@ class Repairer { // Open the log file std::string logname = LogFileName(db_options_.wal_dir, log); - std::unique_ptr lfile; - Status status = env_->NewSequentialFile( - logname, &lfile, env_->OptimizeForLogRead(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr lfile_reader; + Status status = SequentialFileReader::Create( + fs, logname, fs->OptimizeForLogRead(env_options_), &lfile_reader, + nullptr); if (!status.ok()) { return status; } - std::unique_ptr lfile_reader(new SequentialFileReader( - NewLegacySequentialFileWrapper(lfile), logname)); // Create the log reader. 
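Opening the WAL now goes through the FileSystem API and SequentialFileReader::Create instead of the removed LegacySequentialFileWrapper. A hedged sketch of that call shape; OpenWalReader is an illustrative helper and the exact Create() signature is assumed from this patch:

```cpp
#include <memory>
#include <string>

#include "file/sequence_file_reader.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"

namespace ROCKSDB_NAMESPACE {

// Opens a WAL for sequential reading through the FileSystem owned by env.
Status OpenWalReader(Env* env, const std::string& wal_name,
                     const FileOptions& base_opts,
                     std::unique_ptr<SequentialFileReader>* reader) {
  const auto& fs = env->GetFileSystem();
  // OptimizeForLogRead tunes the FileOptions for a one-pass WAL scan.
  return SequentialFileReader::Create(
      fs, wal_name, fs->OptimizeForLogRead(base_opts), reader,
      /*dbg=*/nullptr);
}

}  // namespace ROCKSDB_NAMESPACE
```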
LogReporter reporter; @@ -426,7 +431,8 @@ class Repairer { Arena arena; ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); int64_t _current_time = 0; - status = env_->GetCurrentTime(&_current_time); // ignore error + immutable_db_options_.clock->GetCurrentTime(&_current_time) + .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); @@ -439,22 +445,25 @@ class Repairer { range_del_iters.emplace_back(range_del_iter); } - LegacyFileSystemWrapper fs(env_); IOStatus io_s; + CompressionOptions default_compression; + TableBuilderOptions tboptions( + *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + kNoCompression, default_compression, cfd->GetID(), cfd->GetName(), + -1 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, current_time, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "DB Repairer" /* db_id */, db_session_id_, 0 /*target_file_size*/, + meta.fd.GetNumber()); status = BuildTable( - dbname_, /* versions */ nullptr, env_, &fs, *cfd->ioptions(), - *cfd->GetLatestMutableCFOptions(), env_options_, table_cache_.get(), - iter.get(), std::move(range_del_iters), &meta, - nullptr /* blob_file_additions */, cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), - {}, kMaxSequenceNumber, snapshot_checker, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), false, - nullptr /* internal_stats */, TableFileCreationReason::kRecovery, - &io_s, nullptr /*IOTracer*/, nullptr /* event_logger */, - 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level */, current_time, 0 /* oldest_key_time */, write_hint, - 0 /* file_creation_time */, "DB Repairer" /* db_id */, - db_session_id_); + dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, + env_options_, table_cache_.get(), iter.get(), + std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, + {}, kMaxSequenceNumber, snapshot_checker, + false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, + nullptr /*IOTracer*/, nullptr /* event_logger */, 0 /* job_id */, + Env::IO_HIGH, nullptr /* table_properties */, write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, meta.fd.GetNumber(), @@ -554,10 +563,12 @@ class Repairer { ParsedInternalKey parsed; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); - if (ParseInternalKey(key, &parsed) != Status::OK()) { + Status pik_status = + ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, - "Table #%" PRIu64 ": unparsable key %s", - t->meta.fd.GetNumber(), EscapeString(key).c_str()); + "Table #%" PRIu64 ": unparsable key - %s", + t->meta.fd.GetNumber(), pik_status.getState()); continue; } diff --git a/db/repair_test.cc b/db/repair_test.cc index 49a96bf8aed..9ea2d9460ec 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -80,8 +80,8 @@ TEST_F(RepairTest, CorruptManifest) { Close(); ASSERT_OK(env_->FileExists(manifest_path)); - LegacyFileSystemWrapper fs(env_); - ASSERT_OK(CreateFile(&fs, manifest_path, "blah", false /* use_fsync */)); + ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah", + false /* use_fsync */)); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); 
Reopen(CurrentOptions()); @@ -163,8 +163,8 @@ TEST_F(RepairTest, CorruptSst) { ASSERT_OK(GetFirstSstPath(&sst_path)); ASSERT_FALSE(sst_path.empty()); - LegacyFileSystemWrapper fs(env_); - ASSERT_OK(CreateFile(&fs, sst_path, "blah", false /* use_fsync */)); + ASSERT_OK(CreateFile(env_->GetFileSystem(), sst_path, "blah", + false /* use_fsync */)); Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); @@ -184,7 +184,7 @@ TEST_F(RepairTest, UnflushedSst) { { uint64_t total_ssts_size; std::unordered_map sst_files; - ASSERT_OK(GetAllSSTFiles(&sst_files, &total_ssts_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); ASSERT_EQ(total_ssts_size, 0); } // Need to get path before Close() deletes db_, but delete it after Close() to @@ -203,7 +203,7 @@ TEST_F(RepairTest, UnflushedSst) { { uint64_t total_ssts_size; std::unordered_map sst_files; - ASSERT_OK(GetAllSSTFiles(&sst_files, &total_ssts_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); ASSERT_GT(total_ssts_size, 0); } ASSERT_EQ(Get("key"), "val"); @@ -221,7 +221,7 @@ TEST_F(RepairTest, SeparateWalDir) { { uint64_t total_ssts_size; std::unordered_map sst_files; - ASSERT_OK(GetAllSSTFiles(&sst_files, &total_ssts_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); ASSERT_EQ(total_ssts_size, 0); } std::string manifest_path = @@ -241,7 +241,7 @@ TEST_F(RepairTest, SeparateWalDir) { { uint64_t total_ssts_size; std::unordered_map sst_files; - ASSERT_OK(GetAllSSTFiles(&sst_files, &total_ssts_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); ASSERT_GT(total_ssts_size, 0); } ASSERT_EQ(Get("key"), "val"); diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index 785f814f81d..bfa44e3f53b 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -23,7 +23,7 @@ class SnapshotImpl : public Snapshot { SequenceNumber number_; // const after creation // It indicates the smallest uncommitted data at the time the snapshot was // taken. This is currently used by WritePrepared transactions to limit the - // scope of queries to IsInSnpashot. + // scope of queries to IsInSnapshot. SequenceNumber min_uncommitted_ = kMinUnCommittedSeq; virtual SequenceNumber GetSequenceNumber() const override { return number_; } diff --git a/db/table_cache.cc b/db/table_cache.cc index c6beee18237..2e4d2a58ae4 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -65,17 +65,19 @@ void AppendVarint64(IterKey* key, uint64_t v) { const int kLoadConcurency = 128; -TableCache::TableCache(const ImmutableCFOptions& ioptions, +TableCache::TableCache(const ImmutableOptions& ioptions, const FileOptions& file_options, Cache* const cache, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : ioptions_(ioptions), file_options_(file_options), cache_(cache), immortal_tables_(false), block_cache_tracer_(block_cache_tracer), - loader_mutex_(kLoadConcurency, GetSliceNPHash64), - io_tracer_(io_tracer) { + loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr), + io_tracer_(io_tracer), + db_session_id_(db_session_id) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. 
@@ -106,38 +108,38 @@ Status TableCache::GetTableReader( TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; FileOptions fopts = file_options; - Status s = PrepareIOFromReadOptions(ro, ioptions_.env, fopts.io_options); + Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); if (s.ok()) { s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); } - RecordTick(ioptions_.statistics, NO_FILE_OPENS); + RecordTick(ioptions_.stats, NO_FILE_OPENS); if (s.IsPathNotFound()) { fname = Rocks2LevelTableFileName(fname); - s = PrepareIOFromReadOptions(ro, ioptions_.env, fopts.io_options); + s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); if (s.ok()) { s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, nullptr); } - RecordTick(ioptions_.statistics, NO_FILE_OPENS); + RecordTick(ioptions_.stats, NO_FILE_OPENS); } if (s.ok()) { if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(FSRandomAccessFile::kRandom); } - StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS); std::unique_ptr file_reader( new RandomAccessFileReader( - std::move(file), fname, ioptions_.env, io_tracer_, - record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, ioptions_.listeners)); + std::move(file), fname, ioptions_.clock, io_tracer_, + record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS, + file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners)); s = ioptions_.table_factory->NewTableReader( ro, - TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, skip_filters, immortal_tables_, - false /* force_direct_prefetch */, level, - fd.largest_seqno, block_cache_tracer_, - max_file_size_for_l0_meta_pin), + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, + skip_filters, immortal_tables_, false /* force_direct_prefetch */, + level, fd.largest_seqno, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, fd.GetNumber()), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -161,7 +163,7 @@ Status TableCache::FindTable(const ReadOptions& ro, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin) { - PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env); + PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); *handle = cache_->Lookup(key); @@ -187,7 +189,7 @@ Status TableCache::FindTable(const ReadOptions& ro, max_file_size_for_l0_meta_pin); if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(ioptions_.statistics, NO_FILE_ERRORS); + RecordTick(ioptions_.stats, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. 
} else { @@ -375,10 +377,10 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, ioptions_.row_cache.get(), row_handle); replayGetContextLog(*found_row_cache_entry, user_key, get_context, &value_pinner); - RecordTick(ioptions_.statistics, ROW_CACHE_HIT); + RecordTick(ioptions_.stats, ROW_CACHE_HIT); found = true; } else { - RecordTick(ioptions_.statistics, ROW_CACHE_MISS); + RecordTick(ioptions_.stats, ROW_CACHE_MISS); } return found; } @@ -502,8 +504,8 @@ Status TableCache::MultiGet(const ReadOptions& options, for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { - const Slice& user_key = miter->ukey; - ; + const Slice& user_key = miter->ukey_with_ts; + GetContext* get_context = miter->get_context; if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, @@ -539,9 +541,9 @@ Status TableCache::MultiGet(const ReadOptions& options, ++iter) { SequenceNumber* max_covering_tombstone_seq = iter->get_context->max_covering_tombstone_seq(); - *max_covering_tombstone_seq = - std::max(*max_covering_tombstone_seq, - range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey)); + *max_covering_tombstone_seq = std::max( + *max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts)); } } } @@ -566,7 +568,7 @@ Status TableCache::MultiGet(const ReadOptions& options, for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { std::string& row_cache_entry = row_cache_entries[row_idx++]; - const Slice& user_key = miter->ukey; + const Slice& user_key = miter->ukey_with_ts; ; GetContext* get_context = miter->get_context; diff --git a/db/table_cache.h b/db/table_cache.h index a834683fc17..0c263afe56e 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -10,9 +10,9 @@ // Thread-safe (provides internal synchronization) #pragma once +#include #include #include -#include #include "db/dbformat.h" #include "db/range_del_aggregator.h" @@ -48,10 +48,11 @@ class HistogramImpl; // ioptions.row_cache class TableCache { public: - TableCache(const ImmutableCFOptions& ioptions, + TableCache(const ImmutableOptions& ioptions, const FileOptions& storage_options, Cache* cache, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + const std::string& db_session_id); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -183,7 +184,7 @@ class TableCache { Cache* get_cache() const { return cache_; } - // Capacity of the backing Cache that indicates inifinite TableCache capacity. + // Capacity of the backing Cache that indicates infinite TableCache capacity. // For example when max_open_files is -1 we set the backing Cache to this. 
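The kInfiniteCapacity constant documented above stands in for an unlimited table cache. An assumed sketch of how a caller might size the backing LRU cache when max_open_files is -1; MakeTableCacheBacking is illustrative and not from the patch:

```cpp
#include <memory>

#include "db/table_cache.h"
#include "rocksdb/cache.h"

namespace ROCKSDB_NAMESPACE {

// Sizes the LRU cache that backs the TableCache; -1 open files maps to the
// "infinite" sentinel capacity described above.
std::shared_ptr<Cache> MakeTableCacheBacking(int max_open_files,
                                             int num_shard_bits) {
  const size_t capacity =
      (max_open_files == -1)
          ? static_cast<size_t>(TableCache::kInfiniteCapacity)
          : static_cast<size_t>(max_open_files);
  return NewLRUCache(capacity, num_shard_bits);
}

}  // namespace ROCKSDB_NAMESPACE
```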
static const int kInfiniteCapacity = 0x400000; @@ -220,7 +221,7 @@ class TableCache { bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, size_t prefix_size, GetContext* get_context); - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const FileOptions& file_options_; Cache* const cache_; std::string row_cache_id_; @@ -228,6 +229,7 @@ class TableCache { BlockCacheTracer* const block_cache_tracer_; Striped loader_mutex_; std::shared_ptr io_tracer_; + std::string db_session_id_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index b43afdc5c42..fdf48c92717 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -33,8 +33,8 @@ Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) { ParsedInternalKey ikey; - Status s = ParseInternalKey(key, &ikey); - if (s != Status::OK()) { + Status s = ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!s.ok()) { return s; } @@ -43,10 +43,10 @@ Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, } void UserKeyTablePropertiesCollector::BlockAdd( - uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) { - return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast, - blockCompressedBytesSlow); + uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) { + return collector_->BlockAdd(block_raw_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); } Status UserKeyTablePropertiesCollector::Finish( diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index 130eb64d480..befb436529a 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -27,9 +27,9 @@ class IntTblPropCollector { virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) = 0; - virtual void BlockAdd(uint64_t blockRawBytes, - uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) = 0; + virtual void BlockAdd(uint64_t block_raw_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) = 0; virtual UserCollectedProperties GetReadableProperties() const = 0; @@ -48,6 +48,13 @@ class IntTblPropCollectorFactory { virtual const char* Name() const = 0; }; +using IntTblPropCollectorFactories = + std::vector>; +using IntTblPropCollectorFactoryIter = + IntTblPropCollectorFactories::const_iterator; +using IntTblPropCollectorFactoryRange = + std::pair; + // When rocksdb creates a new table, it will encode all "user keys" into // "internal keys", which contains meta information of a given entry. 
// @@ -64,9 +71,9 @@ class UserKeyTablePropertiesCollector : public IntTblPropCollector { virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) override; - virtual void BlockAdd(uint64_t blockRawBytes, - uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) override; + virtual void BlockAdd(uint64_t block_raw_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override; virtual Status Finish(UserCollectedProperties* properties) override; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 56d7edefe96..301302baeba 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -13,7 +13,6 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "options/cf_options.h" @@ -42,23 +41,22 @@ namespace { static const uint32_t kTestColumnFamilyId = 66; static const std::string kTestColumnFamilyName = "test_column_fam"; -void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, - const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - std::unique_ptr* writable, - std::unique_ptr* builder) { - std::unique_ptr wf(new test::StringSink); +void MakeBuilder( + const Options& options, const ImmutableOptions& ioptions, + const MutableCFOptions& moptions, + const InternalKeyComparator& internal_comparator, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + std::unique_ptr* writable, + std::unique_ptr* builder) { + std::unique_ptr wf(new test::StringSink); writable->reset( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)), - "" /* don't care */, EnvOptions())); + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); int unknown_level = -1; - builder->reset(NewTableBuilder( + TableBuilderOptions tboptions( ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, - kTestColumnFamilyId, kTestColumnFamilyName, writable->get(), - options.compression, options.sample_for_compression, - options.compression_opts, unknown_level)); + options.compression, options.compression_opts, kTestColumnFamilyId, + kTestColumnFamilyName, unknown_level); + builder->reset(NewTableBuilder(tboptions, writable->get())); } } // namespace @@ -178,9 +176,9 @@ class RegularKeysStartWithAInternal : public IntTblPropCollector { return Status::OK(); } - void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Nothing to do. 
return; } @@ -264,10 +262,9 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writer; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; if (test_int_tbl_prop_collector) { int_tbl_prop_collector_factories.emplace_back( new RegularKeysStartWithAFactory(backward_mode)); @@ -286,12 +283,13 @@ void TestCustomizedTablePropertiesCollector( writer->Flush(); // -- Step 2: Read properties - LegacyWritableFileWrapper* file = - static_cast(writer->writable_file()); - test::StringSink* fwf = static_cast(file->target()); + test::StringSink* fwf = + static_cast(writer->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); std::unique_ptr fake_file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); + new RandomAccessFileReader(std::move(source), "test")); + TableProperties* props; Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), magic_number, ioptions, &props, @@ -396,8 +394,7 @@ void TestInternalKeyPropertiesCollector( Options options; test::PlainInternalKeyComparator pikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; options.table_factory = table_factory; if (sanitized) { options.table_properties_collector_factories.emplace_back( @@ -410,11 +407,11 @@ void TestInternalKeyPropertiesCollector( options.info_log = std::make_shared(); options = SanitizeOptions("db", // just a place holder options); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); options.comparator = comparator; } - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); MutableCFOptions moptions(options); for (int iter = 0; iter < 2; ++iter) { @@ -427,12 +424,13 @@ void TestInternalKeyPropertiesCollector( ASSERT_OK(builder->Finish()); writable->Flush(); - LegacyWritableFileWrapper* file = - static_cast(writable->writable_file()); - test::StringSink* fwf = static_cast(file->target()); + test::StringSink* fwf = + static_cast(writable->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); std::unique_ptr reader( - test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); + new RandomAccessFileReader(std::move(source), "test")); + TableProperties* props; Status s = ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index e6180903ff5..ba4c65ff927 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -34,7 +34,7 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( io_tracer_(io_tracer) { assert(files_ != nullptr); assert(versions_ != nullptr); - + current_status_.PermitUncheckedError(); // Clear on start reporter_.env = options_->env; reporter_.info_log = options_->info_log.get(); SeekToStartSequence(); // Seek till starting sequence @@ -225,7 +225,8 @@ bool TransactionLogIteratorImpl::IsBatchExpected( void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { std::unique_ptr batch(new WriteBatch()); - WriteBatchInternal::SetContents(batch.get(), record); + Status s = 
WriteBatchInternal::SetContents(batch.get(), record); + s.PermitUncheckedError(); // TODO: What should we do with this error? SequenceNumber expected_seq = current_last_seq_ + 1; // If the iterator has started, then confirm that we get continuous batches diff --git a/db/version_builder.cc b/db/version_builder.cc index 49c35cf9f2d..474169bda74 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -334,17 +334,23 @@ class VersionBuilder::Rep { TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency1", &pair); #endif if (!level_nonzero_cmp_(f1, f2)) { - return Status::Corruption("L" + NumberToString(level) + - " files are not sorted properly"); + return Status::Corruption( + "L" + NumberToString(level) + + " files are not sorted properly: files #" + + NumberToString(f1->fd.GetNumber()) + ", #" + + NumberToString(f2->fd.GetNumber())); } // Make sure there is no overlap in levels > 0 if (vstorage->InternalComparator()->Compare(f1->largest, f2->smallest) >= 0) { return Status::Corruption( - "L" + NumberToString(level) + " have overlapping ranges " + - (f1->largest).DebugString(true) + " vs. " + - (f2->smallest).DebugString(true)); + "L" + NumberToString(level) + + " have overlapping ranges: file #" + + NumberToString(f1->fd.GetNumber()) + + " largest key: " + (f1->largest).DebugString(true) + + " vs. file #" + NumberToString(f2->fd.GetNumber()) + + " smallest key: " + (f2->smallest).DebugString(true)); } } } @@ -511,6 +517,28 @@ class VersionBuilder::Rep { return meta->oldest_blob_file_number; } + uint64_t GetMinOldestBlobFileNumber() const { + uint64_t min_oldest_blob_file_num = std::numeric_limits::max(); + for (int level = 0; level < num_levels_; ++level) { + const auto& base_files = base_vstorage_->LevelFiles(level); + for (const auto* fmeta : base_files) { + assert(fmeta); + min_oldest_blob_file_num = + std::min(min_oldest_blob_file_num, fmeta->oldest_blob_file_number); + } + const auto& added_files = levels_[level].added_files; + for (const auto& elem : added_files) { + assert(elem.second); + min_oldest_blob_file_num = std::min( + min_oldest_blob_file_num, elem.second->oldest_blob_file_number); + } + } + if (min_oldest_blob_file_num == std::numeric_limits::max()) { + min_oldest_blob_file_num = kInvalidBlobFileNumber; + } + return min_oldest_blob_file_num; + } + Status ApplyFileDeletion(int level, uint64_t file_number) { assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); @@ -828,7 +856,7 @@ class VersionBuilder::Rep { } } - // Save the current state in *v. + // Save the current state in *vstorage. 
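GetMinOldestBlobFileNumber above folds a minimum over base and added files, using kInvalidBlobFileNumber as the sentinel when there are no files at all. A standalone, simplified sketch of that reduction; the names and the sentinel value are restated here as assumptions:

```cpp
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

namespace {

constexpr uint64_t kInvalidBlobFileNumber = 0;  // assumption: 0 means "none"

// Minimum oldest-blob-file number across a set of SST files, falling back to
// the invalid sentinel when the set is empty.
uint64_t MinOldestBlobFile(const std::vector<uint64_t>& oldest_per_sst) {
  uint64_t min_num = std::numeric_limits<uint64_t>::max();
  for (uint64_t oldest : oldest_per_sst) {
    min_num = std::min(min_num, oldest);
  }
  return min_num == std::numeric_limits<uint64_t>::max()
             ? kInvalidBlobFileNumber
             : min_num;
}

}  // namespace
```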
Status SaveTo(VersionStorageInfo* vstorage) { Status s = CheckConsistency(base_vstorage_); if (!s.ok()) { @@ -1046,6 +1074,10 @@ Status VersionBuilder::LoadTableHandlers( is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin); } +uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { + return rep_->GetMinOldestBlobFileNumber(); +} + BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( ColumnFamilyData* cfd) : version_builder_(new VersionBuilder( diff --git a/db/version_builder.h b/db/version_builder.h index a4e1c0d63ba..5a5c9ea1807 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -44,6 +44,7 @@ class VersionBuilder { bool is_initial_load, const SliceTransform* prefix_extractor, size_t max_file_size_for_l0_meta_pin); + uint64_t GetMinOldestBlobFileNumber() const; private: class Rep; diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 513900de5bc..26b473f0fe6 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -11,7 +11,6 @@ #include "db/version_edit.h" #include "db/version_set.h" -#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -23,7 +22,7 @@ class VersionBuilderTest : public testing::Test { const Comparator* ucmp_; InternalKeyComparator icmp_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; VersionStorageInfo vstorage_; uint32_t file_num_; diff --git a/db/version_edit.cc b/db/version_edit.cc index 8879f0e1bca..8cb173a2dd0 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -89,13 +89,14 @@ void VersionEdit::Clear() { blob_file_additions_.clear(); blob_file_garbages_.clear(); wal_additions_.clear(); - wal_deletions_.clear(); + wal_deletion_.Reset(); column_family_ = 0; is_column_family_add_ = false; is_column_family_drop_ = false; column_family_name_.clear(); is_in_atomic_group_ = false; remaining_entries_ = 0; + full_history_ts_low_.clear(); } bool VersionEdit::EncodeTo(std::string* dst) const { @@ -190,6 +191,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const { char p = static_cast(f.fd.GetPathId()); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } + if (f.temperature != Temperature::kUnknown) { + PutVarint32(dst, NewFileCustomTag::kTemperature); + char p = static_cast(f.temperature); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } if (f.marked_for_compaction) { PutVarint32(dst, NewFileCustomTag::kNeedCompaction); char p = static_cast(1); @@ -225,13 +231,17 @@ bool VersionEdit::EncodeTo(std::string* dst) const { } for (const auto& wal_addition : wal_additions_) { - PutVarint32(dst, kWalAddition); - wal_addition.EncodeTo(dst); + PutVarint32(dst, kWalAddition2); + std::string encoded; + wal_addition.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); } - for (const auto& wal_deletion : wal_deletions_) { - PutVarint32(dst, kWalDeletion); - wal_deletion.EncodeTo(dst); + if (!wal_deletion_.IsEmpty()) { + PutVarint32(dst, kWalDeletion2); + std::string encoded; + wal_deletion_.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); } // 0 is default and does not need to be explicitly written @@ -252,6 +262,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, kInAtomicGroup); PutVarint32(dst, remaining_entries_); } + + if (HasFullHistoryTsLow()) { + PutVarint32(dst, kFullHistoryTsLow); + PutLengthPrefixedSlice(dst, full_history_ts_low_); + } return true; } @@ -350,6 +365,16 @@ const char* 
VersionEdit::DecodeNewFile4From(Slice* input) { return "invalid oldest blob file number"; } break; + case kTemperature: + if (field.size() != 1) { + return "temperature field wrong size"; + } else { + Temperature casted_field = static_cast(field[0]); + if (casted_field <= Temperature::kCold) { + f.temperature = casted_field; + } + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -369,6 +394,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); +#ifndef NDEBUG + bool ignore_ignorable_tags = false; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags", + &ignore_ignorable_tags); +#endif Slice input = src; const char* msg = nullptr; uint32_t tag = 0; @@ -379,6 +409,11 @@ Status VersionEdit::DecodeFrom(const Slice& src) { Slice str; InternalKey key; while (msg == nullptr && GetVarint32(&input, &tag)) { +#ifndef NDEBUG + if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) { + tag = kTagSafeIgnoreMask; + } +#endif switch (tag) { case kDbId: if (GetLengthPrefixedSlice(&input, &str)) { @@ -536,7 +571,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } - case kBlobFileAddition: { + case kBlobFileAddition: + case kBlobFileAddition_DEPRECATED: { BlobFileAddition blob_file_addition; const Status s = blob_file_addition.DecodeFrom(&input); if (!s.ok()) { @@ -547,7 +583,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } - case kBlobFileGarbage: { + case kBlobFileGarbage: + case kBlobFileGarbage_DEPRECATED: { BlobFileGarbage blob_file_garbage; const Status s = blob_file_garbage.DecodeFrom(&input); if (!s.ok()) { @@ -569,6 +606,23 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } + case kWalAddition2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalAddition not prefixed by length"; + break; + } + + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + case kWalDeletion: { WalDeletion wal_deletion; const Status s = wal_deletion.DecodeFrom(&input); @@ -576,7 +630,24 @@ Status VersionEdit::DecodeFrom(const Slice& src) { return s; } - wal_deletions_.emplace_back(std::move(wal_deletion)); + wal_deletion_ = std::move(wal_deletion); + break; + } + + case kWalDeletion2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalDeletion not prefixed by length"; + break; + } + + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); break; } @@ -612,6 +683,16 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; + case kFullHistoryTsLow: + if (!GetLengthPrefixedSlice(&input, &str)) { + msg = "full_history_ts_low"; + } else if (str.empty()) { + msg = "full_history_ts_low: empty"; + } else { + full_history_ts_low_.assign(str.data(), str.size()); + } + break; + default: if (tag & kTagSafeIgnoreMask) { // Tag from future which can be safely ignored. 
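kWalAddition2 and kWalDeletion2 above are written as a varint tag followed by a length-prefixed payload, so a reader that does not recognize a tag past kTagSafeIgnoreMask can skip the whole record. A hedged sketch of that framing, using only the util/coding.h helpers that appear in this patch; both functions are illustrative:

```cpp
#include <cstdint>
#include <string>

#include "rocksdb/slice.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Writes an ignorable record: varint tag, then a length-prefixed payload.
void PutIgnorableRecord(std::string* dst, uint32_t tag,
                        const std::string& payload) {
  PutVarint32(dst, tag);
  PutLengthPrefixedSlice(dst, Slice(payload));
}

// Consumes (and discards) the payload of an unrecognized ignorable tag so
// decoding can continue with the next record.
bool SkipIgnorableRecord(Slice* input) {
  Slice payload;
  return GetLengthPrefixedSlice(input, &payload);
}

}  // namespace ROCKSDB_NAMESPACE
```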
@@ -708,6 +789,12 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(f.file_checksum); r.append(" file_checksum_func_name: "); r.append(f.file_checksum_func_name); + if (f.temperature != Temperature::kUnknown) { + r.append(" temperature: "); + // Maybe change to human readable format when the feature becomes + // permanent + r.append(ToString(static_cast(f.temperature))); + } } for (const auto& blob_file_addition : blob_file_additions_) { @@ -725,9 +812,9 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(wal_addition.DebugString()); } - for (const auto& wal_deletion : wal_deletions_) { + if (!wal_deletion_.IsEmpty()) { r.append("\n WalDeletion: "); - r.append(wal_deletion.DebugString()); + r.append(wal_deletion_.DebugString()); } r.append("\n ColumnFamily: "); @@ -744,6 +831,10 @@ std::string VersionEdit::DebugString(bool hex_key) const { AppendNumberTo(&r, remaining_entries_); r.append(" entries remains"); } + if (HasFullHistoryTsLow()) { + r.append("\n FullHistoryTsLow: "); + r.append(Slice(full_history_ts_low_).ToString(hex_key)); + } r.append("\n}\n"); return r; } @@ -806,6 +897,11 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { jw << "OldestBlobFile" << f.oldest_blob_file_number; } + if (f.temperature != Temperature::kUnknown) { + // Maybe change to human readable format when the feature becomes + // permanent + jw << "Temperature" << static_cast(f.temperature); + } jw.EndArrayedObject(); } @@ -854,18 +950,11 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw.EndArray(); } - if (!wal_deletions_.empty()) { - jw << "WalDeletions"; - - jw.StartArray(); - - for (const auto& wal_deletion : wal_deletions_) { - jw.StartArrayedObject(); - jw << wal_deletion; - jw.EndArrayedObject(); - } - - jw.EndArray(); + if (!wal_deletion_.IsEmpty()) { + jw << "WalDeletion"; + jw.StartObject(); + jw << wal_deletion_; + jw.EndObject(); } jw << "ColumnFamily" << column_family_; @@ -880,6 +969,10 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw << "AtomicGroup" << remaining_entries_; } + if (HasFullHistoryTsLow()) { + jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key); + } + jw.EndObject(); return jw.Get(); diff --git a/db/version_edit.h b/db/version_edit.h index b4ab74741a6..ce3283f1422 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -52,15 +52,21 @@ enum Tag : uint32_t { kInAtomicGroup = 300, + kBlobFileAddition = 400, + kBlobFileGarbage, + // Mask for an unidentified tag from the future which can be safely ignored. kTagSafeIgnoreMask = 1 << 13, // Forward compatible (aka ignorable) records kDbId, - kBlobFileAddition, - kBlobFileGarbage, + kBlobFileAddition_DEPRECATED, + kBlobFileGarbage_DEPRECATED, kWalAddition, kWalDeletion, + kFullHistoryTsLow, + kWalAddition2, + kWalDeletion2, }; enum NewFileCustomTag : uint32_t { @@ -68,13 +74,14 @@ enum NewFileCustomTag : uint32_t { kNeedCompaction = 2, // Since Manifest is not entirely forward-compatible, we currently encode // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed - // when manifest becomes forward-comptabile. + // when manifest becomes forward-compatible. kMinLogNumberToKeepHack = 3, kOldestBlobFileNumber = 4, kOldestAncesterTime = 5, kFileCreationTime = 6, kFileChecksum = 7, kFileChecksumFuncName = 8, + kTemperature = 9, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field.
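The new kTemperature custom tag above stores the temperature as a single byte in NewFile4's custom-field list. A hedged sketch of encoding and decoding that field; the helper names and the chosen headers are assumptions, while the tag value 9 and the bounds check mirror this patch:

```cpp
#include <string>

#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Encodes the temperature as a one-byte custom field under tag 9.
void PutTemperatureField(std::string* dst, Temperature temperature) {
  PutVarint32(dst, 9 /* NewFileCustomTag::kTemperature */);
  char p = static_cast<char>(temperature);
  PutLengthPrefixedSlice(dst, Slice(&p, 1));
}

// Decodes the field; returns an error string on malformed input, nullptr on
// success. Values this binary does not recognize are silently dropped.
const char* GetTemperatureField(const Slice& field, Temperature* temperature) {
  if (field.size() != 1) {
    return "temperature field wrong size";
  }
  Temperature casted = static_cast<Temperature>(field[0]);
  if (casted <= Temperature::kCold) {
    *temperature = casted;
  }
  return nullptr;
}

}  // namespace ROCKSDB_NAMESPACE
```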
@@ -182,6 +189,7 @@ struct FileMetaData { bool marked_for_compaction = false; // True if client asked us nicely to // compact this file. + Temperature temperature = Temperature::kUnknown; // Used only in BlobDB. The file number of the oldest blob file this SST file // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. @@ -189,7 +197,7 @@ struct FileMetaData { // The file could be the compaction output from other SST files, which could // in turn be outputs for compact older SST files. We track the memtable - // flush timestamp for the oldest SST file that eventaully contribute data + // flush timestamp for the oldest SST file that eventually contribute data // to this file. 0 means the information is not available. uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; @@ -425,6 +433,7 @@ class VersionEdit { } void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) { + assert(blob_file_additions_.empty()); blob_file_additions_ = std::move(blob_file_additions); } @@ -448,32 +457,44 @@ class VersionEdit { } void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) { + assert(blob_file_garbages_.empty()); blob_file_garbages_ = std::move(blob_file_garbages); } // Add a WAL (either just created or closed). + // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit. void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) { + assert(NumEntries() == wal_additions_.size()); wal_additions_.emplace_back(number, std::move(metadata)); } // Retrieve all the added WALs. const WalAdditions& GetWalAdditions() const { return wal_additions_; } - bool HasWalAddition() const { return !wal_additions_.empty(); } + bool IsWalAddition() const { return !wal_additions_.empty(); } // Delete a WAL (either directly deleted or archived). - void DeleteWal(WalNumber number) { wal_deletions_.emplace_back(number); } + // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit. + void DeleteWalsBefore(WalNumber number) { + assert((NumEntries() == 1) == !wal_deletion_.IsEmpty()); + wal_deletion_ = WalDeletion(number); + } + + const WalDeletion& GetWalDeletion() const { return wal_deletion_; } - // Retrieve all the deleted WALs. - const WalDeletions& GetWalDeletions() const { return wal_deletions_; } + bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); } - bool HasWalDeletion() const { return !wal_deletions_.empty(); } + bool IsWalManipulation() const { + size_t entries = NumEntries(); + return (entries > 0) && ((entries == wal_additions_.size()) || + (entries == !wal_deletion_.IsEmpty())); + } // Number of edits size_t NumEntries() const { return new_files_.size() + deleted_files_.size() + blob_file_additions_.size() + blob_file_garbages_.size() + - wal_additions_.size() + wal_deletions_.size(); + wal_additions_.size() + !wal_deletion_.IsEmpty(); } void SetColumnFamily(uint32_t column_family_id) { @@ -513,6 +534,16 @@ class VersionEdit { bool IsInAtomicGroup() const { return is_in_atomic_group_; } uint32_t GetRemainingEntries() const { return remaining_entries_; } + bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); } + const std::string& GetFullHistoryTsLow() const { + assert(HasFullHistoryTsLow()); + return full_history_ts_low_; + } + void SetFullHistoryTsLow(std::string full_history_ts_low) { + assert(!full_history_ts_low.empty()); + full_history_ts_low_ = std::move(full_history_ts_low); + } + // return true on success. 
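With this change a VersionEdit carries either WAL additions or a single DeleteWalsBefore marker, never both. A hedged sketch of emitting the two kinds of edits; the helper functions are illustrative, and only AddWal and DeleteWalsBefore come from this patch:

```cpp
#include "db/version_edit.h"

namespace ROCKSDB_NAMESPACE {

// Records a newly created (or closed) WAL in a VersionEdit.
VersionEdit MakeWalAdditionEdit(WalNumber number) {
  VersionEdit edit;
  edit.AddWal(number);  // default WalMetadata; exclusive with DeleteWalsBefore
  return edit;
}

// Records that every WAL numbered below `up_to` is now obsolete.
VersionEdit MakeWalDeletionEdit(WalNumber up_to) {
  VersionEdit edit;
  edit.DeleteWalsBefore(up_to);
  return edit;
}

}  // namespace ROCKSDB_NAMESPACE
```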
bool EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); @@ -529,8 +560,11 @@ class VersionEdit { private: friend class ReactiveVersionSet; + friend class VersionEditHandlerBase; + friend class ListColumnFamiliesHandler; friend class VersionEditHandler; friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; friend class VersionSet; friend class Version; friend class AtomicGroupReadBuffer; @@ -565,7 +599,7 @@ class VersionEdit { BlobFileGarbages blob_file_garbages_; WalAdditions wal_additions_; - WalDeletions wal_deletions_; + WalDeletion wal_deletion_; // Each version edit record should have column_family_ set // If it's not set, it is default (0) @@ -579,6 +613,8 @@ class VersionEdit { bool is_in_atomic_group_ = false; uint32_t remaining_entries_ = 0; + + std::string full_history_ts_low_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 75fe107c58f..7a2996a59e2 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -9,28 +9,16 @@ #include "db/version_edit_handler.h" +#include + +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" #include "monitoring/persistent_stats_history.h" namespace ROCKSDB_NAMESPACE { -VersionEditHandler::VersionEditHandler( - bool read_only, const std::vector& column_families, - VersionSet* version_set, bool track_missing_files, - bool no_error_if_table_files_missing, - const std::shared_ptr& io_tracer) - : read_only_(read_only), - column_families_(column_families), - status_(), - version_set_(version_set), - track_missing_files_(track_missing_files), - no_error_if_table_files_missing_(no_error_if_table_files_missing), - initialized_(false), - io_tracer_(io_tracer) { - assert(version_set_ != nullptr); -} - -void VersionEditHandler::Iterate(log::Reader& reader, Status* log_read_status, - std::string* db_id) { +void VersionEditHandlerBase::Iterate(log::Reader& reader, + Status* log_read_status) { Slice record; std::string scratch; assert(log_read_status); @@ -38,19 +26,14 @@ void VersionEditHandler::Iterate(log::Reader& reader, Status* log_read_status, size_t recovered_edits = 0; Status s = Initialize(); - while (s.ok() && reader.ReadRecord(&record, &scratch) && - log_read_status->ok()) { + while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() && + reader.ReadRecord(&record, &scratch) && log_read_status->ok()) { VersionEdit edit; s = edit.DecodeFrom(record); if (!s.ok()) { break; } - if (edit.has_db_id_) { - version_set_->db_id_ = edit.GetDbId(); - if (db_id != nullptr) { - *db_id = version_set_->db_id_; - } - } + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; @@ -86,6 +69,80 @@ void VersionEditHandler::Iterate(log::Reader& reader, Status* log_read_status, if (!s.ok()) { status_ = s; } + TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish", + &recovered_edits); +} + +Status ListColumnFamiliesHandler::ApplyVersionEdit( + VersionEdit& edit, ColumnFamilyData** /*unused*/) { + Status s; + if (edit.is_column_family_add_) { + if (column_family_names_.find(edit.column_family_) != + column_family_names_.end()) { + s = Status::Corruption("Manifest adding the same column family twice"); + } else { + column_family_names_.insert( + {edit.column_family_, edit.column_family_name_}); + } + } else if (edit.is_column_family_drop_) { + if (column_family_names_.find(edit.column_family_) == + column_family_names_.end()) { + s = Status::Corruption("Manifest - dropping non-existing column family"); + } else 
{ + column_family_names_.erase(edit.column_family_); + } + } + return s; +} + +Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) { + for (const auto& deleted_file : edit.GetDeletedFiles()) { + Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second); + if (!s.ok()) { + return s; + } + } + for (const auto& new_file : edit.GetNewFiles()) { + Status s = file_checksum_list_.InsertOneFileChecksum( + new_file.second.fd.GetNumber(), new_file.second.file_checksum, + new_file.second.file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + for (const auto& new_blob_file : edit.GetBlobFileAdditions()) { + std::string checksum_value = new_blob_file.GetChecksumValue(); + std::string checksum_method = new_blob_file.GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (checksum_method.empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + Status s = file_checksum_list_.InsertOneFileChecksum( + new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +VersionEditHandler::VersionEditHandler( + bool read_only, std::vector column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, const std::shared_ptr& io_tracer, + bool skip_load_table_files) + : VersionEditHandlerBase(), + read_only_(read_only), + column_families_(std::move(column_families)), + version_set_(version_set), + track_missing_files_(track_missing_files), + no_error_if_files_missing_(no_error_if_files_missing), + io_tracer_(io_tracer), + skip_load_table_files_(skip_load_table_files), + initialized_(false) { + assert(version_set_ != nullptr); } Status VersionEditHandler::Initialize() { @@ -121,6 +178,10 @@ Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit, s = OnColumnFamilyAdd(edit, cfd); } else if (edit.is_column_family_drop_) { s = OnColumnFamilyDrop(edit, cfd); + } else if (edit.IsWalAddition()) { + s = OnWalAddition(edit); + } else if (edit.IsWalDeletion()) { + s = OnWalDeletion(edit); } else { s = OnNonCfOperation(edit, cfd); } @@ -190,6 +251,17 @@ Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit, return s; } +Status VersionEditHandler::OnWalAddition(VersionEdit& edit) { + assert(edit.IsWalAddition()); + return version_set_->wals_.AddWals(edit.GetWalAdditions()); +} + +Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) { + assert(edit.IsWalDeletion()); + return version_set_->wals_.DeleteWalsBefore( + edit.GetWalDeletion().GetLogNumber()); +} + Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd) { bool cf_in_not_found = false; @@ -231,6 +303,14 @@ bool VersionEditHandler::HasMissingFiles() const { break; } } + if (!ret) { + for (const auto& elem : cf_to_missing_blob_files_high_) { + if (elem.second != kInvalidBlobFileNumber) { + ret = true; + break; + } + } + } return ret; } @@ -259,7 +339,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, Status* s) { assert(s != nullptr); if (!s->ok()) { - read_buffer_.Clear(); + // Do nothing here. 
} else if (!version_edit_params_.has_log_number_ || !version_edit_params_.has_next_file_number_ || !version_edit_params_.has_last_sequence_) { @@ -277,7 +357,10 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, msg.append(" entry in MANIFEST"); *s = Status::Corruption(msg); } - if (s->ok() && !read_only_ && !column_families_not_found_.empty()) { + // There were some column families in the MANIFEST that weren't specified + // in the argument. This is OK in read_only mode + if (s->ok() && MustOpenAllColumnFamilies() && + !column_families_not_found_.empty()) { std::string msg; for (const auto& cf : column_families_not_found_) { msg.append(", "); @@ -294,6 +377,9 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_); version_set_->MarkFileNumberUsed(version_edit_params_.log_number_); for (auto* cfd : *(version_set_->GetColumnFamilySet())) { + if (cfd->IsDropped()) { + continue; + } auto builder_iter = builders_.find(cfd->GetID()); assert(builder_iter != builders_.end()); auto* builder = builder_iter->second->version_builder(); @@ -315,6 +401,10 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false, /*is_initial_load=*/true); if (!s->ok()) { + // If s is IOError::PathNotFound, then we mark the db as corrupted. + if (s->IsPathNotFound()) { + *s = Status::Corruption("Corruption: " + s->ToString()); + } break; } } @@ -357,6 +447,8 @@ ColumnFamilyData* VersionEditHandler::CreateCfAndInit( if (track_missing_files_) { cf_to_missing_files_.emplace(edit.column_family_, std::unordered_set()); + cf_to_missing_blob_files_high_.emplace(edit.column_family_, + kInvalidBlobFileNumber); } return cfd; } @@ -370,15 +462,19 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup( auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_); assert(missing_files_iter != cf_to_missing_files_.end()); cf_to_missing_files_.erase(missing_files_iter); + + auto missing_blob_files_high_iter = + cf_to_missing_blob_files_high_.find(edit.column_family_); + assert(missing_blob_files_high_iter != + cf_to_missing_blob_files_high_.end()); + cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter); } ColumnFamilyData* ret = version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_); assert(ret != nullptr); - if (ret->UnrefAndTryDelete()) { - ret = nullptr; - } else { - assert(false); - } + ret->SetDropped(); + ret->UnrefAndTryDelete(); + ret = nullptr; return ret; } @@ -411,6 +507,9 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, bool prefetch_index_and_filter_in_cache, bool is_initial_load) { + if (skip_load_table_files_) { + return Status::OK(); + } assert(cfd != nullptr); assert(!cfd->IsDropped()); auto builder_iter = builders_.find(cfd->GetID()); @@ -424,8 +523,7 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, prefetch_index_and_filter_in_cache, is_initial_load, cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); - if ((s.IsPathNotFound() || s.IsCorruption()) && - no_error_if_table_files_missing_) { + if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { s = Status::OK(); } if (!s.ok() && !version_set_->db_options_->paranoid_checks) { @@ -437,10 +535,11 @@ Status 
VersionEditHandler::LoadTables(ColumnFamilyData* cfd, Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, const VersionEdit& edit) { Status s; + if (edit.has_db_id_) { + version_set_->db_id_ = edit.GetDbId(); + version_edit_params_.SetDBId(edit.db_id_); + } if (cfd != nullptr) { - if (edit.has_db_id_) { - version_edit_params_.SetDBId(edit.db_id_); - } if (edit.has_log_number_) { if (cfd->GetLogNumber() > edit.log_number_) { ROCKS_LOG_WARN( @@ -454,9 +553,17 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, } if (edit.has_comparator_ && edit.comparator_ != cfd->user_comparator()->Name()) { - s = Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); + if (!cf_to_cmp_names_) { + s = Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } else { + cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_); + } + } + if (edit.HasFullHistoryTsLow()) { + const std::string& new_ts = edit.GetFullHistoryTsLow(); + cfd->SetFullHistoryTsLow(new_ts); } } @@ -486,12 +593,11 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, } VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( - bool read_only, const std::vector& column_families, + bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer) : VersionEditHandler(read_only, column_families, version_set, /*track_missing_files=*/true, - /*no_error_if_table_files_missing=*/true, io_tracer), - io_tracer_(io_tracer) {} + /*no_error_if_files_missing=*/true, io_tracer) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { for (const auto& elem : versions_) { @@ -541,7 +647,29 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID()); assert(missing_files_iter != cf_to_missing_files_.end()); std::unordered_set& missing_files = missing_files_iter->second; - const bool prev_has_missing_files = !missing_files.empty(); + + auto missing_blob_files_high_iter = + cf_to_missing_blob_files_high_.find(cfd->GetID()); + assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end()); + const uint64_t prev_missing_blob_file_high = + missing_blob_files_high_iter->second; + + VersionBuilder* builder = nullptr; + + if (prev_missing_blob_file_high != kInvalidBlobFileNumber) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + } + + // At this point, we have not yet applied the new version edits read from the + // MANIFEST. We check whether we have any missing table and blob files. 
+ const bool prev_has_missing_files = + !missing_files.empty() || + (prev_missing_blob_file_high != kInvalidBlobFileNumber && + prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber()); + for (const auto& file : edit.GetDeletedFiles()) { uint64_t file_num = file.second; auto fiter = missing_files.find(file_num); @@ -549,6 +677,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( missing_files.erase(fiter); } } + + assert(!cfd->ioptions()->cf_paths.empty()); Status s; for (const auto& elem : edit.GetNewFiles()) { const FileMetaData& meta = elem.second; @@ -556,7 +686,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( uint64_t file_num = fd.GetNumber(); const std::string fpath = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num); - s = version_set_->VerifyFileMetadata(fpath, meta); + s = VerifyFile(fpath, meta); if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { missing_files.insert(file_num); s = Status::OK(); @@ -564,17 +694,60 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( break; } } + + uint64_t missing_blob_file_num = prev_missing_blob_file_high; + for (const auto& elem : edit.GetBlobFileAdditions()) { + uint64_t file_num = elem.GetBlobFileNumber(); + s = VerifyBlobFile(cfd, file_num, elem); + if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { + missing_blob_file_num = std::max(missing_blob_file_num, file_num); + s = Status::OK(); + } else if (!s.ok()) { + break; + } + } + + bool has_missing_blob_files = false; + if (missing_blob_file_num != kInvalidBlobFileNumber && + missing_blob_file_num >= prev_missing_blob_file_high) { + missing_blob_files_high_iter->second = missing_blob_file_num; + has_missing_blob_files = true; + } else if (missing_blob_file_num < prev_missing_blob_file_high) { + assert(false); + } + + // We still have not applied the new version edit, but have tried to add new + // table and blob files after verifying their presence and consistency. + // Therefore, we know whether we will see new missing table and blob files + // later after actually applying the version edit. We perform the check here + // and record the result. + const bool has_missing_files = + !missing_files.empty() || has_missing_blob_files; + bool missing_info = !version_edit_params_.has_log_number_ || !version_edit_params_.has_next_file_number_ || !version_edit_params_.has_last_sequence_; - // Create version before apply edit + // Create version before applying the edit. The version will represent the state + // before applying the version edit. + // A new version will be created if: + // 1) no error has occurred so far, and + // 2) log_number_, next_file_number_ and last_sequence_ are known, and + // 3) any of the following: + // a) no missing file before, but will have missing file(s) after applying + // this version edit. + // b) no missing file after applying the version edit, and the caller + // explicitly requests that a new version be created.
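The logic above tracks missing table files individually, while missing blob files are collapsed into a single per-column-family high watermark. A minimal self-contained sketch of that rule follows; the names are illustrative stand-ins, and the kInvalidBlobFileNumber sentinel is assumed to be 0, which this patch does not spell out.

#include <cstdint>
#include <unordered_set>

// Illustrative stand-ins; the real constant and accessors live elsewhere in the tree.
constexpr uint64_t kInvalidBlobFileNumber = 0;  // assumed sentinel value

// A column family still has missing files if any table file is unaccounted
// for, or if the highest missing blob file number is at least as large as the
// oldest blob file that the surviving table files may still reference.
bool HasMissingFiles(const std::unordered_set<uint64_t>& missing_table_files,
                     uint64_t missing_blob_file_high,
                     uint64_t min_oldest_blob_file_number) {
  const bool missing_blob_files =
      missing_blob_file_high != kInvalidBlobFileNumber &&
      missing_blob_file_high >= min_oldest_blob_file_number;
  return !missing_table_files.empty() || missing_blob_files;
}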
if (s.ok() && !missing_info && - ((!missing_files.empty() && !prev_has_missing_files) || - (missing_files.empty() && force_create_version))) { - auto builder_iter = builders_.find(cfd->GetID()); - assert(builder_iter != builders_.end()); - auto* builder = builder_iter->second->version_builder(); + ((has_missing_files && !prev_has_missing_files) || + (!has_missing_files && force_create_version))) { + if (!builder) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder); + } + auto* version = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, version_set_->current_version_number_++); @@ -597,4 +770,159 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( return s; } +Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) { + return version_set_->VerifyFileMetadata(fpath, fmeta); +} + +Status VersionEditHandlerPointInTime::VerifyBlobFile( + ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition) { + BlobFileCache* blob_file_cache = cfd->blob_file_cache(); + assert(blob_file_cache); + CacheHandleGuard blob_file_reader; + Status s = + blob_file_cache->GetBlobFileReader(blob_file_num, &blob_file_reader); + if (!s.ok()) { + return s; + } + // TODO: verify checksum + (void)blob_addition; + return s; +} + +Status ManifestTailer::Initialize() { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::Initialize(); + } + assert(Mode::kCatchUp == mode_); + Status s; + if (!initialized_) { + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* default_cfd = cfd_set->GetDefault(); + assert(default_cfd); + auto builder_iter = builders_.find(default_cfd->GetID()); + assert(builder_iter != builders_.end()); + + Version* dummy_version = default_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(default_cfd, base_version)); + builder_iter->second = std::move(new_builder); + + initialized_ = true; + } + return s; +} + +Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) { + Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd); + if (s.ok()) { + assert(cfd); + if (*cfd) { + cfds_changed_.insert(*cfd); + } + } + return s; +} + +Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit, + ColumnFamilyData** cfd) { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::OnColumnFamilyAdd(edit, cfd); + } + assert(Mode::kCatchUp == mode_); + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* tmp_cfd = cfd_set->GetColumnFamily(edit.GetColumnFamily()); + assert(cfd); + *cfd = tmp_cfd; + if (!tmp_cfd) { + // For now, ignore new column families created after Recover() succeeds. 
+ return Status::OK(); + } + auto builder_iter = builders_.find(edit.GetColumnFamily()); + assert(builder_iter != builders_.end()); + + Version* dummy_version = tmp_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(tmp_cfd, base_version)); + builder_iter->second = std::move(new_builder); + +#ifndef NDEBUG + auto version_iter = versions_.find(edit.GetColumnFamily()); + assert(version_iter != versions_.end()); +#endif // !NDEBUG + return Status::OK(); +} + +void ManifestTailer::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandlerPointInTime::CheckIterationResult(reader, s); + assert(s); + if (s->ok()) { + if (Mode::kRecovery == mode_) { + mode_ = Mode::kCatchUp; + } else { + assert(Mode::kCatchUp == mode_); + } + } +} + +Status ManifestTailer::VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) { + Status s = VersionEditHandlerPointInTime::VerifyFile(fpath, fmeta); + // TODO: Open file or create hard link to prevent the file from being + // deleted. + return s; +} + +void DumpManifestHandler::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandler::CheckIterationResult(reader, s); + if (!s->ok()) { + fprintf(stdout, "%s\n", s->ToString().c_str()); + return; + } + assert(cf_to_cmp_names_); + for (auto* cfd : *(version_set_->column_family_set_)) { + fprintf(stdout, + "--------------- Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber()); + auto it = cf_to_cmp_names_->find(cfd->GetID()); + if (it != cf_to_cmp_names_->end()) { + fprintf(stdout, + "comparator: <%s>, but the comparator object is not available.\n", + it->second.c_str()); + } else { + fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name()); + } + assert(cfd->current()); + + // Print out DebugStrings. Can include non-terminating null characters. 
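The fwrite() call that follows (and the matching calls in DumpManifestHandler::ApplyVersionEdit in the header further down) is used because a printf-style "%s" would stop at the first embedded NUL byte in the DebugString output. A tiny standalone illustration; WriteAll is a hypothetical helper, not part of the patch:

#include <cstdio>
#include <string>

// Hypothetical helper: writes every byte of the string, including embedded '\0'.
void WriteAll(const std::string& s) {
  fwrite(s.data(), sizeof(char), s.size(), stdout);
}

int main() {
  const std::string s("a\0b", 3);  // three bytes, the middle one is NUL
  printf("%s\n", s.c_str());       // prints only "a"
  WriteAll(s);                     // writes all three bytes
  return 0;
}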
+ fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char), + cfd->current()->DebugString(hex_).size(), stdout); + } + fprintf(stdout, + "next_file_number %" PRIu64 " last_sequence %" PRIu64 + " prev_log_number %" PRIu64 " max_column_family %" PRIu32 + " min_log_number_to_keep " + "%" PRIu64 "\n", + version_set_->current_next_file_number(), + version_set_->LastSequence(), version_set_->prev_log_number(), + version_set_->column_family_set_->GetMaxColumnFamily(), + version_set_->min_log_number_to_keep_2pc()); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index da222a8f3db..665e0f0d43f 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -15,7 +15,79 @@ namespace ROCKSDB_NAMESPACE { -typedef std::unique_ptr VersionBuilderUPtr; +struct FileMetaData; + +class VersionEditHandlerBase { + public: + explicit VersionEditHandlerBase() + : max_manifest_read_size_(std::numeric_limits::max()) {} + + virtual ~VersionEditHandlerBase() {} + + void Iterate(log::Reader& reader, Status* log_read_status); + + const Status& status() const { return status_; } + + AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; } + + protected: + explicit VersionEditHandlerBase(uint64_t max_read_size) + : max_manifest_read_size_(max_read_size) {} + virtual Status Initialize() { return Status::OK(); } + + virtual Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) = 0; + + virtual void CheckIterationResult(const log::Reader& /*reader*/, + Status* /*s*/) {} + + void ClearReadBuffer() { read_buffer_.Clear(); } + + Status status_; + + private: + AtomicGroupReadBuffer read_buffer_; + const uint64_t max_manifest_read_size_; +}; + +class ListColumnFamiliesHandler : public VersionEditHandlerBase { + public: + ListColumnFamiliesHandler() : VersionEditHandlerBase() {} + + ~ListColumnFamiliesHandler() override {} + + const std::map GetColumnFamilyNames() const { + return column_family_names_; + } + + protected: + Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) override; + + private: + // default column family is always implicitly there + std::map column_family_names_{ + {0, kDefaultColumnFamilyName}}; +}; + +class FileChecksumRetriever : public VersionEditHandlerBase { + public: + FileChecksumRetriever(uint64_t max_read_size, + FileChecksumList& file_checksum_list) + : VersionEditHandlerBase(max_read_size), + file_checksum_list_(file_checksum_list) {} + + ~FileChecksumRetriever() override {} + + protected: + Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) override; + + private: + FileChecksumList& file_checksum_list_; +}; + +using VersionBuilderUPtr = std::unique_ptr; // A class used for scanning MANIFEST file. // VersionEditHandler reads a MANIFEST file, parses the version edits, and @@ -24,44 +96,64 @@ typedef std::unique_ptr VersionBuilderUPtr; // To use this class and its subclasses, // 1. Create an object of VersionEditHandler or its subclasses. // VersionEditHandler handler(read_only, column_families, version_set, -// track_missing_files, ignore_missing_files); +// track_missing_files, +// no_error_if_files_missing); // 2. Status s = handler.Iterate(reader, &db_id); // 3. Check s and handle possible errors. // // Not thread-safe, external synchronization is necessary if an object of // VersionEditHandler is shared by multiple threads. 
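For reference, the call pattern implied by the updated declarations (the two-argument Iterate() on the base class plus the status() and GetDbId() accessors) looks roughly like the sketch below. It relies on RocksDB-internal types (log::Reader, VersionSet, IOTracer) constructed elsewhere, so it is a sketch rather than compilable standalone code:

// Assumes manifest_reader, column_families, version_set and io_tracer were
// set up by the caller; the element type of column_families is as declared above.
VersionEditHandler handler(/*read_only=*/false, column_families, version_set,
                           /*track_missing_files=*/false,
                           /*no_error_if_files_missing=*/false, io_tracer);
Status log_read_status;  // I/O errors encountered while reading the MANIFEST
handler.Iterate(manifest_reader, &log_read_status);
const Status& s = handler.status();  // result of applying the decoded edits
std::string db_id;
handler.GetDbId(&db_id);  // filled in only if the MANIFEST recorded a DB id
// Both log_read_status and s need to be checked before using the recovered state.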
-class VersionEditHandler { +class VersionEditHandler : public VersionEditHandlerBase { public: explicit VersionEditHandler( bool read_only, const std::vector& column_families, VersionSet* version_set, bool track_missing_files, - bool ignore_missing_files, const std::shared_ptr& io_tracer); + bool no_error_if_files_missing, + const std::shared_ptr& io_tracer) + : VersionEditHandler(read_only, column_families, version_set, + track_missing_files, no_error_if_files_missing, + io_tracer, /*skip_load_table_files=*/false) {} - virtual ~VersionEditHandler() {} + ~VersionEditHandler() override {} - void Iterate(log::Reader& reader, Status* log_read_status, - std::string* db_id); - - const Status& status() const { return status_; } + const VersionEditParams& GetVersionEditParams() const { + return version_edit_params_; + } bool HasMissingFiles() const; + void GetDbId(std::string* db_id) const { + if (db_id && version_edit_params_.has_db_id_) { + *db_id = version_edit_params_.db_id_; + } + } + protected: - Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd); + explicit VersionEditHandler( + bool read_only, std::vector column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, + const std::shared_ptr& io_tracer, bool skip_load_table_files); - Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd); + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; + + virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd); Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd); Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd); - Status Initialize(); + Status OnWalAddition(VersionEdit& edit); + + Status OnWalDeletion(VersionEdit& edit); + + Status Initialize() override; void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found, bool* cf_in_builders) const; - virtual void CheckIterationResult(const log::Reader& reader, Status* s); + void CheckIterationResult(const log::Reader& reader, Status* s) override; ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options, const VersionEdit& edit); @@ -76,26 +168,31 @@ class VersionEditHandler { bool prefetch_index_and_filter_in_cache, bool is_initial_load); + virtual bool MustOpenAllColumnFamilies() const { return !read_only_; } + const bool read_only_; - const std::vector& column_families_; - Status status_; + std::vector column_families_; VersionSet* version_set_; - AtomicGroupReadBuffer read_buffer_; std::unordered_map builders_; std::unordered_map name_to_options_; + // Keeps track of column families in manifest that were not found in + // column families parameters. if those column families are not dropped + // by subsequent manifest records, Recover() will return failure status. std::unordered_map column_families_not_found_; VersionEditParams version_edit_params_; const bool track_missing_files_; std::unordered_map> cf_to_missing_files_; - bool no_error_if_table_files_missing_; + std::unordered_map cf_to_missing_blob_files_high_; + bool no_error_if_files_missing_; + std::shared_ptr io_tracer_; + bool skip_load_table_files_; + bool initialized_; + std::unique_ptr> cf_to_cmp_names_; private: Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, const VersionEdit& edit); - - bool initialized_; - std::shared_ptr io_tracer_; }; // A class similar to its base class, i.e. VersionEditHandler. 
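VersionEditHandlerPointInTime, whose declaration follows, recovers to the latest point in the MANIFEST at which all referenced table and blob files are present. This is most likely the machinery behind the public best_efforts_recovery open option; a short, hedged example of that public entry point is given below (the path and option values are illustrative only):

#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  // Roll back to a consistent point in the MANIFEST instead of failing the
  // open when some SST/blob files referenced by newer edits are missing.
  options.best_efforts_recovery = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_best_efforts_example", &db);
  if (!s.ok()) {
    std::cerr << "Open failed: " << s.ToString() << std::endl;
    return 1;
  }
  // ... use db, possibly missing the most recent updates ...
  delete db;
  return 0;
}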
@@ -107,8 +204,7 @@ class VersionEditHandler { class VersionEditHandlerPointInTime : public VersionEditHandler { public: VersionEditHandlerPointInTime( - bool read_only, - const std::vector& column_families, + bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer); ~VersionEditHandlerPointInTime() override; @@ -117,10 +213,97 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) override; + virtual Status VerifyFile(const std::string& fpath, + const FileMetaData& fmeta); + virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition); - private: std::unordered_map versions_; - std::shared_ptr io_tracer_; +}; + +class ManifestTailer : public VersionEditHandlerPointInTime { + public: + explicit ManifestTailer(std::vector column_families, + VersionSet* version_set, + const std::shared_ptr& io_tracer) + : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, + version_set, io_tracer), + mode_(Mode::kRecovery) {} + + void PrepareToReadNewManifest() { + initialized_ = false; + ClearReadBuffer(); + } + + std::unordered_set& GetUpdatedColumnFamilies() { + return cfds_changed_; + } + + protected: + Status Initialize() override; + + bool MustOpenAllColumnFamilies() const override { return false; } + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; + + Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override; + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + Status VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) override; + + enum Mode : uint8_t { + kRecovery = 0, + kCatchUp = 1, + }; + + Mode mode_; + std::unordered_set cfds_changed_; +}; + +class DumpManifestHandler : public VersionEditHandler { + public: + DumpManifestHandler(std::vector column_families, + VersionSet* version_set, + const std::shared_ptr& io_tracer, bool verbose, + bool hex, bool json) + : VersionEditHandler( + /*read_only=*/true, column_families, version_set, + /*track_missing_files=*/false, + /*no_error_if_files_missing=*/false, io_tracer, + /*skip_load_table_files=*/true), + verbose_(verbose), + hex_(hex), + json_(json), + count_(0) { + cf_to_cmp_names_.reset(new std::unordered_map()); + } + + ~DumpManifestHandler() override {} + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override { + // Write out each individual edit + if (verbose_ && !json_) { + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } else if (json_) { + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } + ++count_; + return VersionEditHandler::ApplyVersionEdit(edit, cfd); + } + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + private: + const bool verbose_; + const bool hex_; + const bool json_; + int count_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index ea62d9a784f..43ae6840fb1 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -8,8 +8,10 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include "db/version_edit.h" + #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/string_util.h" @@ -317,23 +319,27 @@ TEST_F(VersionEditTest, AddWalEncodeDecode) { if (has_size) { meta.SetSyncedSizeInBytes(rand() % 1000); } - bool is_closed = rand() % 2 == 0; - if (is_closed) { - meta.SetClosed(); - } edit.AddWal(log_number, meta); } TestEncodeDecode(edit); } +static std::string PrefixEncodedWalAdditionWithLength( + const std::string& encoded) { + std::string ret; + PutVarint32(&ret, Tag::kWalAddition2); + PutLengthPrefixedSlice(&ret, encoded); + return ret; +} + TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { std::string encoded; - PutVarint32(&encoded, Tag::kWalAddition); { // No log number. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != std::string::npos) @@ -347,8 +353,10 @@ TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { unsigned char* ptr = reinterpret_cast(&c); *ptr = 128; encoded.append(1, c); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != std::string::npos) @@ -360,14 +368,14 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { constexpr WalNumber kLogNumber = 100; constexpr uint64_t kSizeInBytes = 100; - std::string encoded_without_tag; - PutVarint32(&encoded_without_tag, Tag::kWalAddition); - PutVarint64(&encoded_without_tag, kLogNumber); + std::string encoded; + PutVarint64(&encoded, kLogNumber); { // No tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded_without_tag); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) << s.ToString(); @@ -375,12 +383,15 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { { // Only has size tag, no terminate tag. - std::string encoded_with_size = encoded_without_tag; + std::string encoded_with_size = encoded; PutVarint32(&encoded_with_size, static_cast(WalAdditionTag::kSyncedSize)); PutVarint64(&encoded_with_size, kSizeInBytes); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_size); VersionEdit edit; - Status s = edit.DecodeFrom(encoded_with_size); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) << s.ToString(); @@ -388,11 +399,14 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { { // Only has terminate tag. 
- std::string encoded_with_terminate = encoded_without_tag; + std::string encoded_with_terminate = encoded; PutVarint32(&encoded_with_terminate, static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_terminate); VersionEdit edit; - ASSERT_OK(edit.DecodeFrom(encoded_with_terminate)); + ASSERT_OK(edit.DecodeFrom(encoded_edit)); auto& wal_addition = edit.GetWalAdditions()[0]; ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber); ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize()); @@ -403,15 +417,15 @@ TEST_F(VersionEditTest, AddWalDecodeNoSize) { constexpr WalNumber kLogNumber = 100; std::string encoded; - PutVarint32(&encoded, Tag::kWalAddition); PutVarint64(&encoded, kLogNumber); PutVarint32(&encoded, static_cast(WalAdditionTag::kSyncedSize)); // No real size after the size tag. { // Without terminate tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") != std::string::npos) @@ -421,8 +435,10 @@ TEST_F(VersionEditTest, AddWalDecodeNoSize) { { // With terminate tag. PutVarint32(&encoded, static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); // The terminate tag is misunderstood as the size. ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) @@ -442,7 +458,7 @@ TEST_F(VersionEditTest, AddWalDebug) { const WalAdditions& wals = edit.GetWalAdditions(); - ASSERT_TRUE(edit.HasWalAddition()); + ASSERT_TRUE(edit.IsWalAddition()); ASSERT_EQ(wals.size(), n); for (int i = 0; i < n; i++) { const WalAddition& wal = wals[i]; @@ -454,7 +470,7 @@ TEST_F(VersionEditTest, AddWalDebug) { for (int i = 0; i < n; i++) { std::stringstream ss; ss << " WalAddition: log_number: " << kLogNumbers[i] - << " synced_size_in_bytes: " << kSizeInBytes[i] << " closed: 0\n"; + << " synced_size_in_bytes: " << kSizeInBytes[i] << "\n"; expected_str += ss.str(); } expected_str += " ColumnFamily: 0\n}\n"; @@ -464,8 +480,7 @@ TEST_F(VersionEditTest, AddWalDebug) { for (int i = 0; i < n; i++) { std::stringstream ss; ss << "{\"LogNumber\": " << kLogNumbers[i] << ", " - << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << ", " - << "\"Closed\": 0}"; + << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}"; if (i < n - 1) ss << ", "; expected_json += ss.str(); } @@ -475,9 +490,7 @@ TEST_F(VersionEditTest, AddWalDebug) { TEST_F(VersionEditTest, DeleteWalEncodeDecode) { VersionEdit edit; - for (uint64_t log_number = 1; log_number <= 20; log_number++) { - edit.DeleteWal(log_number); - } + edit.DeleteWalsBefore(rand() % 100); TestEncodeDecode(edit); } @@ -486,39 +499,96 @@ TEST_F(VersionEditTest, DeleteWalDebug) { constexpr std::array kLogNumbers{{10, 20}}; VersionEdit edit; - for (int i = 0; i < n; i++) { - edit.DeleteWal(kLogNumbers[i]); - } + edit.DeleteWalsBefore(kLogNumbers[n - 1]); - const WalDeletions& wals = edit.GetWalDeletions(); + const WalDeletion& wal = edit.GetWalDeletion(); - ASSERT_TRUE(edit.HasWalDeletion()); - ASSERT_EQ(wals.size(), n); - for (int i = 0; i < n; i++) { - const WalDeletion& wal = wals[i]; - ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[i]); - } + ASSERT_TRUE(edit.IsWalDeletion()); + ASSERT_EQ(wal.GetLogNumber(), 
kLogNumbers[n - 1]); std::string expected_str = "VersionEdit {\n"; - for (int i = 0; i < n; i++) { + { std::stringstream ss; - ss << " WalDeletion: log_number: " << kLogNumbers[i] << "\n"; + ss << " WalDeletion: log_number: " << kLogNumbers[n - 1] << "\n"; expected_str += ss.str(); } expected_str += " ColumnFamily: 0\n}\n"; ASSERT_EQ(edit.DebugString(true), expected_str); - std::string expected_json = "{\"EditNumber\": 4, \"WalDeletions\": ["; - for (int i = 0; i < n; i++) { + std::string expected_json = "{\"EditNumber\": 4, \"WalDeletion\": "; + { std::stringstream ss; - ss << "{\"LogNumber\": " << kLogNumbers[i] << "}"; - if (i < n - 1) ss << ", "; + ss << "{\"LogNumber\": " << kLogNumbers[n - 1] << "}"; expected_json += ss.str(); } - expected_json += "], \"ColumnFamily\": 0}"; + expected_json += ", \"ColumnFamily\": 0}"; ASSERT_EQ(edit.DebugJSON(4, true), expected_json); } +TEST_F(VersionEditTest, FullHistoryTsLow) { + VersionEdit edit; + ASSERT_FALSE(edit.HasFullHistoryTsLow()); + std::string ts = test::EncodeInt(0); + edit.SetFullHistoryTsLow(ts); + TestEncodeDecode(edit); +} + +// Tests that if RocksDB is downgraded, the new types of VersionEdits +// that have a tag larger than kTagSafeIgnoreMask can be safely ignored. +TEST_F(VersionEditTest, IgnorableTags) { + SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) { + bool* ignore = static_cast(arg); + *ignore = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t kPrevLogNumber = 100; + constexpr uint64_t kLogNumber = 200; + constexpr uint64_t kNextFileNumber = 300; + constexpr uint64_t kColumnFamilyId = 400; + + VersionEdit edit; + // Add some ignorable entries. + for (int i = 0; i < 2; i++) { + edit.AddWal(i + 1, WalMetadata(i + 2)); + } + edit.SetDBId("db_id"); + // Add unignorable entries. + edit.SetPrevLogNumber(kPrevLogNumber); + edit.SetLogNumber(kLogNumber); + // Add more ignorable entries. + edit.DeleteWalsBefore(100); + // Add unignorable entry. + edit.SetNextFile(kNextFileNumber); + // Add more ignorable entries. + edit.SetFullHistoryTsLow("ts"); + // Add unignorable entry. + edit.SetColumnFamily(kColumnFamilyId); + + std::string encoded; + ASSERT_TRUE(edit.EncodeTo(&encoded)); + + VersionEdit decoded; + ASSERT_OK(decoded.DecodeFrom(encoded)); + + // Check that all ignorable entries are ignored. + ASSERT_FALSE(decoded.HasDbId()); + ASSERT_FALSE(decoded.HasFullHistoryTsLow()); + ASSERT_FALSE(decoded.IsWalAddition()); + ASSERT_FALSE(decoded.IsWalDeletion()); + ASSERT_TRUE(decoded.GetWalAdditions().empty()); + ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty()); + + // Check that unignorable entries are still present. 
+ ASSERT_EQ(edit.GetPrevLogNumber(), kPrevLogNumber); + ASSERT_EQ(edit.GetLogNumber(), kLogNumber); + ASSERT_EQ(edit.GetNextFile(), kNextFileNumber); + ASSERT_EQ(edit.GetColumnFamily(), kColumnFamilyId); + + SyncPoint::GetInstance()->DisableProcessing(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/version_set.cc b/db/version_set.cc index b6550dd60c7..e1964482143 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,11 +9,10 @@ #include "db/version_set.h" -#include - #include #include #include +#include #include #include #include @@ -22,6 +21,10 @@ #include #include "compaction/compaction.h" +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_index.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -39,6 +42,7 @@ #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/persistent_stats_history.h" +#include "options/options_helper.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/write_buffer_manager.h" @@ -92,7 +96,8 @@ Status OverlapWithIterator(const Comparator* ucmp, *overlap = false; if (iter->Valid()) { ParsedInternalKey seek_result; - Status s = ParseInternalKey(iter->key(), &seek_result); + Status s = ParseInternalKey(iter->key(), &seek_result, + false /* log_err_key */); // TODO if (!s.ok()) return s; if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= @@ -404,7 +409,7 @@ class FilePickerMultiGet { int GetCurrentLevel() const { return curr_level_; } // Iterates through files in the current level until it finds a file that - // contains atleast one key from the MultiGet batch + // contains at least one key from the MultiGet batch bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range, size_t* file_index, FdWithKeyRange** fd, bool* is_last_key_in_file) { @@ -436,7 +441,7 @@ class FilePickerMultiGet { !file_hit)) { struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level]; - Slice& user_key = batch_iter_->ukey; + Slice& user_key = batch_iter_->ukey_without_ts; // Do key range filtering of files or/and fractional cascading if: // (1) not all the files are in level 0, or @@ -450,17 +455,17 @@ class FilePickerMultiGet { // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. 
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->smallest_key), true); + assert(curr_level_ == 0 || fp_ctx.curr_index_in_curr_level == fp_ctx.start_index_in_curr_level || - user_comparator_->Compare(user_key, - ExtractUserKey(f->smallest_key)) <= 0); + cmp_smallest <= 0); - int cmp_smallest = user_comparator_->Compare( - user_key, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare( - user_key, ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->largest_key), true); } else { cmp_largest = -1; } @@ -493,8 +498,9 @@ class FilePickerMultiGet { upper_key_ = batch_iter_; ++upper_key_; while (upper_key_ != current_level_range_.end() && - user_comparator_->Compare(batch_iter_->ukey, upper_key_->ukey) == - 0) { + user_comparator_->CompareWithoutTimestamp( + batch_iter_->ukey_without_ts, false, + upper_key_->ukey_without_ts, false) == 0) { ++upper_key_; } break; @@ -1091,13 +1097,17 @@ void LevelIterator::Seek(const Slice& target) { // next key after the prefix, or make the iterator invalid. // A side benefit will be that it invalidates the iterator earlier so that // the upper level merging iterator can merge fewer child iterators. - Slice target_user_key = ExtractUserKey(target); - Slice file_user_key = ExtractUserKey(file_iter_.key()); - if (prefix_extractor_->InDomain(target_user_key) && - (!prefix_extractor_->InDomain(file_user_key) || - user_comparator_.Compare( - prefix_extractor_->Transform(target_user_key), - prefix_extractor_->Transform(file_user_key)) != 0)) { + size_t ts_sz = user_comparator_.timestamp_size(); + Slice target_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(target, ts_sz); + Slice file_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(file_iter_.key(), ts_sz); + if (prefix_extractor_->InDomain(target_user_key_without_ts) && + (!prefix_extractor_->InDomain(file_user_key_without_ts) || + user_comparator_.CompareWithoutTimestamp( + prefix_extractor_->Transform(target_user_key_without_ts), false, + prefix_extractor_->Transform(file_user_key_without_ts), + false) != 0)) { SetFileIterator(nullptr); } } @@ -1293,7 +1303,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, if (!s.ok()) { return s; } - RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); *tp = std::shared_ptr(raw_table_properties); return s; @@ -1476,15 +1486,16 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { file_path = ioptions->cf_paths.back().path; } const uint64_t file_number = file->fd.GetNumber(); - files.emplace_back(SstFileMetaData{ + files.emplace_back( MakeTableFileName("", file_number), file_number, file_path, static_cast(file->fd.GetFileSize()), file->fd.smallest_seqno, file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted, file->oldest_blob_file_number, - file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(), - file->file_checksum, file->file_checksum_func_name}); + file->being_compacted, file->temperature, + file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), + file->TryGetFileCreationTime(), file->file_checksum, + file->file_checksum_func_name); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; 
level_size += file->fd.GetFileSize(); @@ -1752,13 +1763,14 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, const std::shared_ptr& io_tracer, uint64_t version_number) : env_(vset->env_), + clock_(vset->clock_), cfd_(column_family_data), - info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log), - db_statistics_((cfd_ == nullptr) ? nullptr - : cfd_->ioptions()->statistics), + info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger), + db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats), table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()), - merge_operator_((cfd_ == nullptr) ? nullptr - : cfd_->ioptions()->merge_operator), + blob_file_cache_(cfd_ ? cfd_->blob_file_cache() : nullptr), + merge_operator_( + (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()), storage_info_( (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(), (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(), @@ -1780,6 +1792,62 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, version_number_(version_number), io_tracer_(io_tracer) {} +Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, PinnableSlice* value, + uint64_t* bytes_read) const { + if (read_options.read_tier == kBlockCacheTier) { + return Status::Incomplete("Cannot read blob: no disk I/O allowed"); + } + + BlobIndex blob_index; + + { + Status s = blob_index.DecodeFrom(blob_index_slice); + if (!s.ok()) { + return s; + } + } + + return GetBlob(read_options, user_key, blob_index, value, bytes_read); +} + +Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, PinnableSlice* value, + uint64_t* bytes_read) const { + assert(value); + + if (blob_index.HasTTL() || blob_index.IsInlined()) { + return Status::Corruption("Unexpected TTL/inlined blob index"); + } + + const auto& blob_files = storage_info_.GetBlobFiles(); + + const uint64_t blob_file_number = blob_index.file_number(); + + const auto it = blob_files.find(blob_file_number); + if (it == blob_files.end()) { + return Status::Corruption("Invalid blob file number"); + } + + CacheHandleGuard blob_file_reader; + + { + assert(blob_file_cache_); + const Status s = blob_file_cache_->GetBlobFileReader(blob_file_number, + &blob_file_reader); + if (!s.ok()) { + return s; + } + } + + assert(blob_file_reader.GetValue()); + const Status s = blob_file_reader.GetValue()->GetBlob( + read_options, user_key, blob_index.offset(), blob_index.size(), + blob_index.compression(), value, bytes_read); + + return s; +} + void Version::Get(const ReadOptions& read_options, const LookupKey& k, PinnableSlice* value, std::string* timestamp, Status* status, MergeContext* merge_context, @@ -1802,13 +1870,21 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, vset_->block_cache_tracer_->is_tracing_enabled()) { tracing_get_id = vset_->block_cache_tracer_->NextGetId(); } + + // Note: the old StackableDB-based BlobDB passes in + // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we + // need to provide it here. + bool is_blob_index = false; + bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index; + BlobFetcher blob_fetcher(this, read_options); + GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, do_merge ? value : nullptr, do_merge ? 
timestamp : nullptr, value_found, - merge_context, do_merge, max_covering_tombstone_seq, this->env_, seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, - tracing_get_id); + merge_context, do_merge, max_covering_tombstone_seq, clock_, seq, + merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob_to_use, + tracing_get_id, &blob_fetcher); // Pin blocks that we read to hold merge operands if (merge_operator_) { @@ -1834,7 +1910,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; - StopWatchNano timer(env_, timer_enabled /* auto_start */); + StopWatchNano timer(clock_, timer_enabled /* auto_start */); *status = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, &get_context, mutable_cf_options_.prefix_extractor.get(), @@ -1872,8 +1948,25 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + + if (is_blob_index) { + if (do_merge && value) { + constexpr uint64_t* bytes_read = nullptr; + + *status = + GetBlob(read_options, user_key, *value, value, bytes_read); + if (!status->ok()) { + if (status->IsIncomplete()) { + get_context.MarkKeyMayExist(); + } + return; + } + } + } + return; case GetContext::kDeleted: // Use empty error message for speed @@ -1882,7 +1975,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, case GetContext::kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case GetContext::kBlobIndex: + case GetContext::kUnexpectedBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( "Encounter unexpected blob index. Please open DB with " @@ -1909,7 +2002,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, std::string* str_value = value != nullptr ? value->GetSelf() : nullptr; *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, merge_context->GetOperands(), - str_value, info_log_, db_statistics_, env_, + str_value, info_log_, db_statistics_, clock_, nullptr /* result_operand */, true); if (LIKELY(value != nullptr)) { value->PinSelf(); @@ -1923,7 +2016,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob) { + ReadCallback* callback) { PinnedIteratorsManager pinned_iters_mgr; // Pin blocks that we read to hold merge operands @@ -1940,15 +2033,16 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, // use autovector in order to avoid unnecessary construction of GetContext // objects, which is expensive autovector get_ctx; + BlobFetcher blob_fetcher(this, read_options); for (auto iter = range->begin(); iter != range->end(); ++iter) { assert(iter->s->ok() || iter->s->IsMergeInProgress()); get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, - iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, - iter->value, iter->timestamp, nullptr, &(iter->merge_context), true, - &iter->max_covering_tombstone_seq, this->env_, nullptr, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, - tracing_mget_id); + iter->s->ok() ? 
GetContext::kNotFound : GetContext::kMerge, + iter->ukey_with_ts, iter->value, iter->timestamp, nullptr, + &(iter->merge_context), true, &iter->max_covering_tombstone_seq, clock_, + nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, + &iter->is_blob_index, tracing_mget_id, &blob_fetcher); // MergeInProgress status, if set, has been transferred to the get_context // state, so we set status to ok here. From now on, the iter status will // be used for IO errors, and get_context state will be used for any @@ -1978,7 +2072,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; - StopWatchNano timer(env_, timer_enabled /* auto_start */); + StopWatchNano timer(clock_, timer_enabled /* auto_start */); s = table_cache_->MultiGet( read_options, *internal_comparator(), *f->file_metadata, &file_range, mutable_cf_options_.prefix_extractor.get(), @@ -2050,10 +2144,29 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); - file_range.AddValueSize(iter->value->size()); + file_range.MarkKeyDone(iter); + + if (iter->is_blob_index) { + if (iter->value) { + constexpr uint64_t* bytes_read = nullptr; + + *status = GetBlob(read_options, iter->ukey_with_ts, *iter->value, + iter->value, bytes_read); + if (!status->ok()) { + if (status->IsIncomplete()) { + get_context.MarkKeyMayExist(); + } + + continue; + } + } + } + + file_range.AddValueSize(iter->value->size()); if (file_range.GetValueSize() > read_options.value_size_soft_limit) { s = Status::Aborted(); break; @@ -2069,7 +2182,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, Status::Corruption("corrupted key for ", iter->lkey->user_key()); file_range.MarkKeyDone(iter); continue; - case GetContext::kBlobIndex: + case GetContext::kUnexpectedBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( "Encounter unexpected blob index. Please open DB with " @@ -2124,7 +2237,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, iter->value != nullptr ? 
iter->value->GetSelf() : nullptr; *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(), - str_value, info_log_, db_statistics_, env_, + str_value, info_log_, db_statistics_, clock_, nullptr /* result_operand */, true); if (LIKELY(iter->value != nullptr)) { iter->value->PinSelf(); @@ -2417,13 +2530,13 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded( } namespace { -uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, +uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const std::vector& files) { uint32_t ttl_expired_files_count = 0; int64_t _current_time; - auto status = ioptions.env->GetCurrentTime(&_current_time); + auto status = ioptions.clock->GetCurrentTime(&_current_time); if (status.ok()) { const uint64_t current_time = static_cast(_current_time); for (FileMetaData* f : files) { @@ -2441,7 +2554,7 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, } // anonymous namespace void VersionStorageInfo::ComputeCompactionScore( - const ImmutableCFOptions& immutable_cf_options, + const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options) { for (int level = 0; level <= MaxInputLevel(); level++) { double score; @@ -2493,7 +2606,7 @@ void VersionStorageInfo::ComputeCompactionScore( if (mutable_cf_options.ttl > 0) { score = std::max( static_cast(GetExpiredTtlFilesCount( - immutable_cf_options, mutable_cf_options, files_[level])), + immutable_options, mutable_cf_options, files_[level])), score); } @@ -2505,7 +2618,7 @@ void VersionStorageInfo::ComputeCompactionScore( // L0 files. Take into account size as well to avoid later giant // compactions to the base level. uint64_t l0_target_size = mutable_cf_options.max_bytes_for_level_base; - if (immutable_cf_options.level_compaction_dynamic_level_bytes && + if (immutable_options.level_compaction_dynamic_level_bytes && level_multiplier_ != 0.0) { // Prevent L0 to Lbase fanout from growing larger than // `level_multiplier_`. 
This prevents us from getting stuck picking @@ -2553,11 +2666,11 @@ void VersionStorageInfo::ComputeCompactionScore( ComputeFilesMarkedForCompaction(); ComputeBottommostFilesMarkedForCompaction(); if (mutable_cf_options.ttl > 0) { - ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl); + ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); } if (mutable_cf_options.periodic_compaction_seconds > 0) { ComputeFilesMarkedForPeriodicCompaction( - immutable_cf_options, mutable_cf_options.periodic_compaction_seconds); + immutable_options, mutable_cf_options.periodic_compaction_seconds); } EstimateCompactionBytesNeeded(mutable_cf_options); } @@ -2586,13 +2699,13 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction() { } void VersionStorageInfo::ComputeExpiredTtlFiles( - const ImmutableCFOptions& ioptions, const uint64_t ttl) { + const ImmutableOptions& ioptions, const uint64_t ttl) { assert(ttl > 0); expired_ttl_files_.clear(); int64_t _current_time; - auto status = ioptions.env->GetCurrentTime(&_current_time); + auto status = ioptions.clock->GetCurrentTime(&_current_time); if (!status.ok()) { return; } @@ -2612,14 +2725,14 @@ void VersionStorageInfo::ComputeExpiredTtlFiles( } void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const uint64_t periodic_compaction_seconds) { assert(periodic_compaction_seconds > 0); files_marked_for_periodic_compaction_.clear(); int64_t temp_current_time; - auto status = ioptions.env->GetCurrentTime(&temp_current_time); + auto status = ioptions.clock->GetCurrentTime(&temp_current_time); if (!status.ok()) { return; } @@ -2653,7 +2766,7 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( status = ioptions.env->GetFileModificationTime( file_path, &file_modification_time); if (!status.ok()) { - ROCKS_LOG_WARN(ioptions.info_log, + ROCKS_LOG_WARN(ioptions.logger, "Can't get file modification time: %s: %s", file_path.c_str(), status.ToString().c_str()); continue; @@ -2676,7 +2789,7 @@ struct Fsize { FileMetaData* file; }; -// Compator that is used to sort files based on their size +// Comparator that is used to sort files based on their size // In normal mode: descending size bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { return (first.file->compensated_file_size > @@ -3096,7 +3209,7 @@ void VersionStorageInfo::GetCleanInputsWithinInterval( // specified range. From that file, iterate backwards and // forwards to find all overlapping files. // if within_range is set, then only store the maximum clean inputs -// within range [begin, end]. "clean" means there is a boudnary +// within range [begin, end]. "clean" means there is a boundary // between the files in "*inputs" and the surrounding files void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( int level, const InternalKey* begin, const InternalKey* end, @@ -3286,7 +3399,7 @@ uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const { return level_max_bytes_[level]; } -void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, +void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options) { // Special logic to set number of sorted runs. // It is to match the previous behavior when all files are in L0. @@ -3376,7 +3489,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, // base_bytes_min. We set it be base_bytes_min. 
base_level_size = base_bytes_min + 1U; base_level_ = first_non_empty_level; - ROCKS_LOG_INFO(ioptions.info_log, + ROCKS_LOG_INFO(ioptions.logger, "More existing levels in DB than needed. " "max_bytes_for_level_multiplier may not be guaranteed."); } else { @@ -3407,7 +3520,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, // 1. the L0 size is larger than level size base, or // 2. number of L0 files reaches twice the L0->L1 compaction trigger // We don't do this otherwise to keep the LSM-tree structure stable - // unless the L0 compation is backlogged. + // unless the L0 compaction is backlogged. base_level_size = l0_size; if (base_level_ == num_levels_ - 1) { level_multiplier_ = 1.0; @@ -3593,16 +3706,30 @@ struct VersionSet::ManifestWriter { ColumnFamilyData* cfd; const MutableCFOptions mutable_cf_options; const autovector& edit_list; + const std::function manifest_write_callback; - explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, - const MutableCFOptions& cf_options, - const autovector& e) + explicit ManifestWriter( + InstrumentedMutex* mu, ColumnFamilyData* _cfd, + const MutableCFOptions& cf_options, const autovector& e, + const std::function& manifest_wcb) : done(false), cv(mu), cfd(_cfd), mutable_cf_options(cf_options), - edit_list(e) {} + edit_list(e), + manifest_write_callback(manifest_wcb) {} ~ManifestWriter() { status.PermitUncheckedError(); } + + bool IsAllWalEdits() const { + bool all_wal_edits = true; + for (const auto& e : edit_list) { + if (!e->IsWalManipulation()) { + all_wal_edits = false; + break; + } + } + return all_wal_edits; + } }; Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { @@ -3656,13 +3783,16 @@ VersionSet::VersionSet(const std::string& dbname, WriteBufferManager* write_buffer_manager, WriteController* write_controller, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : column_family_set_( new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, write_buffer_manager, write_controller, - block_cache_tracer, io_tracer)), + block_cache_tracer, io_tracer, db_session_id)), + table_cache_(table_cache), env_(_db_options->env), fs_(_db_options->fs, io_tracer), + clock_(_db_options->clock), dbname_(dbname), db_options_(_db_options), next_file_number_(2), @@ -3677,17 +3807,17 @@ VersionSet::VersionSet(const std::string& dbname, manifest_file_size_(0), file_options_(storage_options), block_cache_tracer_(block_cache_tracer), - io_tracer_(io_tracer) {} + io_tracer_(io_tracer), + db_session_id_(db_session_id) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet - Cache* table_cache = column_family_set_->get_table_cache(); column_family_set_.reset(); for (auto& file : obsolete_files_) { if (file.metadata->table_reader_handle) { - table_cache->Release(file.metadata->table_reader_handle); - TableCache::Evict(table_cache, file.metadata->fd.GetNumber()); + table_cache_->Release(file.metadata->table_reader_handle); + TableCache::Evict(table_cache_, file.metadata->fd.GetNumber()); } file.DeleteMetadata(); } @@ -3697,12 +3827,11 @@ VersionSet::~VersionSet() { void VersionSet::Reset() { if (column_family_set_) { - Cache* table_cache = column_family_set_->get_table_cache(); WriteBufferManager* wbm = column_family_set_->write_buffer_manager(); WriteController* wc = column_family_set_->write_controller(); - column_family_set_.reset( - new 
ColumnFamilySet(dbname_, db_options_, file_options_, table_cache, - wbm, wc, block_cache_tracer_, io_tracer_)); + column_family_set_.reset(new ColumnFamilySet( + dbname_, db_options_, file_options_, table_cache_, wbm, wc, + block_cache_tracer_, io_tracer_, db_session_id_)); } db_id_.clear(); next_file_number_.store(2); @@ -3755,6 +3884,7 @@ Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options) { + mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); ManifestWriter* last_writer = &first_writer; @@ -3831,16 +3961,22 @@ Status VersionSet::ProcessManifestWrites( } } if (version == nullptr) { - version = new Version(last_writer->cfd, this, file_options_, - last_writer->mutable_cf_options, io_tracer_, - current_version_number_++); - versions.push_back(version); - mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); - builder_guards.emplace_back( - new BaseReferencedVersionBuilder(last_writer->cfd)); - builder = builder_guards.back()->version_builder(); + // WAL manipulations do not need to be applied to versions. + if (!last_writer->IsAllWalEdits()) { + version = new Version(last_writer->cfd, this, file_options_, + last_writer->mutable_cf_options, io_tracer_, + current_version_number_++); + versions.push_back(version); + mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); + builder_guards.emplace_back( + new BaseReferencedVersionBuilder(last_writer->cfd)); + builder = builder_guards.back()->version_builder(); + } + assert(last_writer->IsAllWalEdits() || builder); + assert(last_writer->IsAllWalEdits() || version); + TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion", + version); } - assert(builder != nullptr); // make checker happy for (const auto& e : last_writer->edit_list) { if (e->is_in_atomic_group_) { if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ || @@ -3926,6 +4062,7 @@ Status VersionSet::ProcessManifestWrites( // reads its content after releasing db mutex to avoid race with // SwitchMemtable(). 
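The snapshot loop in the lines that follow stores per-column-family state as MutableCFState(log_number, full_history_ts_low). The struct definition itself is not visible in these hunks; its assumed shape, inferred from how it is constructed and read back, is:

// Assumed definition (declared in version_set.h, not shown in this part of
// the patch); fields match the two constructor arguments used below.
struct MutableCFState {
  uint64_t log_number = 0;
  std::string full_history_ts_low;
  MutableCFState(uint64_t _log_number, std::string ts_low)
      : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
};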
std::unordered_map curr_state; + VersionEdit wal_additions; if (new_descriptor_log) { pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_.load()); @@ -3938,18 +4075,25 @@ Status VersionSet::ProcessManifestWrites( } for (const auto* cfd : *column_family_set_) { assert(curr_state.find(cfd->GetID()) == curr_state.end()); - curr_state[cfd->GetID()] = {cfd->GetLogNumber()}; + curr_state.emplace(std::make_pair( + cfd->GetID(), + MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow()))); + } + + for (const auto& wal : wals_.GetWals()) { + wal_additions.AddWal(wal.first, wal.second); } } uint64_t new_manifest_file_size = 0; Status s; IOStatus io_s; + IOStatus manifest_io_status; { FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_); mu->Unlock(); - - TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest"); + TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart"); + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr); if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { for (int i = 0; i < static_cast(versions.size()); ++i) { assert(!builder_guards.empty() && @@ -3986,15 +4130,17 @@ Status VersionSet::ProcessManifestWrites( if (io_s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - + FileTypeSet tmp_set = db_options_->checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( - std::move(descriptor_file), descriptor_fname, opt_file_opts, env_, - io_tracer_, nullptr, db_options_->listeners)); + std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_, + io_tracer_, nullptr, db_options_->listeners, nullptr, + tmp_set.Contains(FileType::kDescriptorFile))); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteCurrentStateToManifest(curr_state, descriptor_log_.get(), - io_s); + s = WriteCurrentStateToManifest(curr_state, wal_additions, + descriptor_log_.get(), io_s); } else { + manifest_io_status = io_s; s = io_s; } } @@ -4017,8 +4163,8 @@ Status VersionSet::ProcessManifestWrites( e->DebugString(true)); break; } - TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord", - rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord", + REDUCE_ODDS2); #ifndef NDEBUG if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { TEST_SYNC_POINT_CALLBACK( @@ -4032,12 +4178,14 @@ Status VersionSet::ProcessManifestWrites( io_s = descriptor_log_->AddRecord(record); if (!io_s.ok()) { s = io_s; + manifest_io_status = io_s; break; } } if (s.ok()) { if (!db_options_->disable_manifest_sync) { - io_s = SyncManifest(env_, db_options_, descriptor_log_->file()); + io_s = SyncManifest(db_options_, descriptor_log_->file()); + manifest_io_status = io_s; } TEST_SYNC_POINT_CALLBACK( "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s); @@ -4051,6 +4199,9 @@ Status VersionSet::ProcessManifestWrites( // If we just created a new descriptor file, install it by writing a // new CURRENT file that points to it. + if (s.ok()) { + assert(manifest_io_status.ok()); + } if (s.ok() && new_descriptor_log) { io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_, db_directory); @@ -4076,6 +4227,20 @@ Status VersionSet::ProcessManifestWrites( mu->Lock(); } + if (s.ok()) { + // Apply WAL edits, DB mutex must be held. 
+ for (auto& e : batch_edits) { + if (e->IsWalAddition()) { + s = wals_.AddWals(e->GetWalAdditions()); + } else if (e->IsWalDeletion()) { + s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber()); + } + if (!s.ok()) { + break; + } + } + } + if (!io_s.ok()) { if (io_status_.ok()) { io_status_ = io_s; @@ -4105,23 +4270,23 @@ Status VersionSet::ProcessManifestWrites( // Each version in versions corresponds to a column family. // For each column family, update its log number indicating that logs // with number smaller than this should be ignored. - for (const auto version : versions) { - uint64_t max_log_number_in_batch = 0; - uint32_t cf_id = version->cfd_->GetID(); - for (const auto& e : batch_edits) { - if (e->has_log_number_ && e->column_family_ == cf_id) { - max_log_number_in_batch = - std::max(max_log_number_in_batch, e->log_number_); - } + uint64_t last_min_log_number_to_keep = 0; + for (const auto& e : batch_edits) { + ColumnFamilyData* cfd = nullptr; + if (!e->IsColumnFamilyManipulation()) { + cfd = column_family_set_->GetColumnFamily(e->column_family_); + // e would not have been added to batch_edits if its corresponding + // column family is dropped. + assert(cfd); } - if (max_log_number_in_batch != 0) { - assert(version->cfd_->GetLogNumber() <= max_log_number_in_batch); - version->cfd_->SetLogNumber(max_log_number_in_batch); + if (cfd) { + if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) { + cfd->SetLogNumber(e->log_number_); + } + if (e->HasFullHistoryTsLow()) { + cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow()); + } } - } - - uint64_t last_min_log_number_to_keep = 0; - for (auto& e : batch_edits) { if (e->has_min_log_number_to_keep_) { last_min_log_number_to_keep = std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_); @@ -4152,11 +4317,41 @@ Status VersionSet::ProcessManifestWrites( for (auto v : versions) { delete v; } + if (manifest_io_status.ok()) { + manifest_file_number_ = pending_manifest_file_number_; + manifest_file_size_ = new_manifest_file_size; + } // If manifest append failed for whatever reason, the file could be // corrupted. So we need to force the next version update to start a // new manifest file. descriptor_log_.reset(); - if (new_descriptor_log) { + // If manifest operations failed, then we know the CURRENT file still + // points to the original MANIFEST. Therefore, we can safely delete the + // new MANIFEST. + // If manifest operations succeeded, and we are here, then it is possible + // that renaming tmp file to CURRENT failed. + // + // On local POSIX-compliant FS, the CURRENT must point to the original + // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also + // keep it. Future recovery will ignore this MANIFEST. It's also ok for the + // process not to crash and continue using the db. Any future LogAndApply() + // call will switch to a new MANIFEST and update CURRENT, still ignoring + // this one. + // + // On non-local FS, it is + // possible that the rename operation succeeded on the server (remote) + // side, but the client somehow returns a non-ok status to RocksDB. Note + // that this does not violate atomicity. Should we delete the new MANIFEST + // successfully, a subsequent recovery attempt will likely see the CURRENT + // pointing to the new MANIFEST, thus fail. We will not be able to open the + // DB again. Therefore, if manifest operations succeed, we should keep the + // the new MANIFEST. 
If the process proceeds, any future LogAndApply() call + // will switch to a new MANIFEST and update CURRENT. If user tries to + // re-open the DB, + // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present. + // b) CURRENT points to the original MANIFEST, and the original MANIFEST + // also exists. + if (new_descriptor_log && !manifest_io_status.ok()) { ROCKS_LOG_INFO(db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", @@ -4187,6 +4382,9 @@ Status VersionSet::ProcessManifestWrites( } ready->status = s; ready->done = true; + if (ready->manifest_write_callback) { + (ready->manifest_write_callback)(s); + } if (need_signal) { ready->cv.Signal(); } @@ -4200,14 +4398,15 @@ Status VersionSet::ProcessManifestWrites( return s; } -// 'datas' is gramatically incorrect. We still use this notation to indicate +// 'datas' is grammatically incorrect. We still use this notation to indicate // that this variable represents a collection of column_family_data. Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { + const ColumnFamilyOptions* new_cf_options, + const std::vector>& manifest_wcbs) { mu->AssertHeld(); int num_edits = 0; for (const auto& elist : edit_lists) { @@ -4237,12 +4436,16 @@ Status VersionSet::LogAndApply( assert(static_cast(num_cfds) == edit_lists.size()); } for (int i = 0; i < num_cfds; ++i) { + const auto wcb = + manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i]; writers.emplace_back(mu, column_family_datas[i], - *mutable_cf_options_list[i], edit_lists[i]); + *mutable_cf_options_list[i], edit_lists[i], wcb); manifest_writers_.push_back(&writers[i]); } assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting", + nullptr); while (!first_writer.done && &first_writer != manifest_writers_.front()) { first_writer.cv.Wait(); } @@ -4254,6 +4457,7 @@ Status VersionSet::LogAndApply( for (const auto& writer : writers) { assert(writer.done); } + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu); #endif /* !NDEBUG */ return first_writer.status; } @@ -4323,153 +4527,11 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_ : last_sequence_); - Status s = builder->Apply(edit); - - return s; -} - -Status VersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, - const std::unordered_map& name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map>& - builders, - VersionEditParams* version_edit_params) { - // Not found means that user didn't supply that column - // family option AND we encountered column family add - // record. Once we encounter column family drop record, - // we will delete the column family from - // column_families_not_found. 
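The extended LogAndApply() overload above accepts one manifest-write callback per column family, forwarded into each ManifestWriter. A hedged usage sketch; the local variables (cfds, mutable_cf_options_list, edit_lists, mu, db_directory) are assumed to exist in the caller:

// Hypothetical caller of the new LogAndApply() overload: the callback runs
// with the MANIFEST write status for this writer, before LogAndApply()
// returns to the caller.
std::vector<std::function<void(const Status&)>> manifest_wcbs;
manifest_wcbs.emplace_back([](const Status& s) {
  if (!s.ok()) {
    // e.g. surface the MANIFEST write failure to the component that
    // queued this edit.
  }
});
Status s = versions->LogAndApply(cfds, mutable_cf_options_list, edit_lists,
                                 &mu, db_directory,
                                 /*new_descriptor_log=*/false,
                                 /*new_cf_options=*/nullptr, manifest_wcbs);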
- bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) != - column_families_not_found.end()); - // in builders means that user supplied that column family - // option AND that we encountered column family add record - bool cf_in_builders = builders.find(edit.column_family_) != builders.end(); - - // they can't both be true - assert(!(cf_in_not_found && cf_in_builders)); - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders || cf_in_not_found) { - return Status::Corruption( - "Manifest adding the same column family twice: " + - edit.column_family_name_); - } - auto cf_options = name_to_options.find(edit.column_family_name_); - // implicitly add persistent_stats column family without requiring user - // to specify - bool is_persistent_stats_column_family = - edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; - if (cf_options == name_to_options.end() && - !is_persistent_stats_column_family) { - column_families_not_found.insert( - {edit.column_family_, edit.column_family_name_}); - } else { - // recover persistent_stats CF from a DB that already contains it - if (is_persistent_stats_column_family) { - ColumnFamilyOptions cfo; - OptimizeForPersistentStats(&cfo); - cfd = CreateColumnFamily(cfo, &edit); - } else { - cfd = CreateColumnFamily(cf_options->second, &edit); - } - cfd->set_initialized(); - builders.insert(std::make_pair( - edit.column_family_, std::unique_ptr( - new BaseReferencedVersionBuilder(cfd)))); - } - } else if (edit.is_column_family_drop_) { - if (cf_in_builders) { - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - builders.erase(builder); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - assert(cfd != nullptr); - if (cfd->UnrefAndTryDelete()) { - cfd = nullptr; - } else { - // who else can have reference to cfd!? 
- assert(false); - } - } else if (cf_in_not_found) { - column_families_not_found.erase(edit.column_family_); - } else { - return Status::Corruption( - "Manifest - dropping non-existing column family"); - } - } else if (!cf_in_not_found) { - if (!cf_in_builders) { - return Status::Corruption( - "Manifest record referencing unknown column family"); - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - Status s = builder->second->version_builder()->Apply(&edit); - if (!s.ok()) { - return s; - } - } - return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params); -} - -Status VersionSet::ExtractInfoFromVersionEdit( - ColumnFamilyData* cfd, const VersionEdit& from_edit, - VersionEditParams* version_edit_params) { - if (cfd != nullptr) { - if (from_edit.has_db_id_) { - version_edit_params->SetDBId(from_edit.db_id_); - } - if (from_edit.has_log_number_) { - if (cfd->GetLogNumber() > from_edit.log_number_) { - ROCKS_LOG_WARN( - db_options_->info_log, - "MANIFEST corruption detected, but ignored - Log numbers in " - "records NOT monotonically increasing"); - } else { - cfd->SetLogNumber(from_edit.log_number_); - version_edit_params->SetLogNumber(from_edit.log_number_); - } - } - if (from_edit.has_comparator_ && - from_edit.comparator_ != cfd->user_comparator()->Name()) { - return Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + from_edit.comparator_); - } - } - - if (from_edit.has_prev_log_number_) { - version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_); - } - - if (from_edit.has_next_file_number_) { - version_edit_params->SetNextFile(from_edit.next_file_number_); - } - - if (from_edit.has_max_column_family_) { - version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_); - } - - if (from_edit.has_min_log_number_to_keep_) { - version_edit_params->min_log_number_to_keep_ = - std::max(version_edit_params->min_log_number_to_keep_, - from_edit.min_log_number_to_keep_); - } - - if (from_edit.has_last_sequence_) { - version_edit_params->SetLastSequence(from_edit.last_sequence_); - } - return Status::OK(); + // The builder can be nullptr only if edit is WAL manipulation, + // because WAL edits do not need to be applied to versions, + // we return Status::OK() in this case. + assert(builder || edit->IsWalManipulation()); + return builder ? 
builder->Apply(edit) : Status::OK(); } Status VersionSet::GetCurrentManifestPath(const std::string& dbname, @@ -4503,89 +4565,9 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname, return Status::OK(); } -Status VersionSet::ReadAndRecover( - log::Reader& reader, AtomicGroupReadBuffer* read_buffer, - const std::unordered_map& name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map>& - builders, - Status* log_read_status, VersionEditParams* version_edit_params, - std::string* db_id) { - assert(read_buffer != nullptr); - assert(log_read_status != nullptr); - Status s; - Slice record; - std::string scratch; - size_t recovered_edits = 0; - while (s.ok() && reader.ReadRecord(&record, &scratch) && - log_read_status->ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - if (edit.has_db_id_) { - db_id_ = edit.GetDbId(); - if (db_id != nullptr) { - db_id->assign(edit.GetDbId()); - } - } - s = read_buffer->AddEdit(&edit); - if (!s.ok()) { - break; - } - if (edit.is_in_atomic_group_) { - if (read_buffer->IsFull()) { - // Apply edits in an atomic group when we have read all edits in the - // group. - for (auto& e : read_buffer->replay_buffer()) { - s = ApplyOneVersionEditToBuilder(e, name_to_options, - column_families_not_found, builders, - version_edit_params); - if (!s.ok()) { - break; - } - recovered_edits++; - } - if (!s.ok()) { - break; - } - read_buffer->Clear(); - } - } else { - // Apply a normal edit immediately. - s = ApplyOneVersionEditToBuilder(edit, name_to_options, - column_families_not_found, builders, - version_edit_params); - if (s.ok()) { - recovered_edits++; - } - } - } - if (!log_read_status->ok()) { - s = *log_read_status; - } - if (!s.ok()) { - // Clear the buffer if we fail to decode/apply an edit. - read_buffer->Clear(); - } - TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", - &recovered_edits); - return s; -} - Status VersionSet::Recover( const std::vector& column_families, bool read_only, std::string* db_id) { - std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.emplace(cf.name, cf.options); - } - // keeps track of column families in manifest that were not found in - // column families parameters. if those column families are not dropped - // by subsequent manifest records, Recover() will return failure status - std::unordered_map column_families_not_found; - // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_path; Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, @@ -4610,139 +4592,30 @@ Status VersionSet::Recover( new SequentialFileReader(std::move(manifest_file), manifest_path, db_options_->log_readahead_size, io_tracer_)); } - - VersionBuilderMap builders; - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. 
- default_cfd->set_initialized(); - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); uint64_t current_manifest_file_size = 0; - VersionEditParams version_edit_params; + uint64_t log_number = 0; { VersionSet::LogReporter reporter; Status log_read_status; reporter.status = &log_read_status; log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, true /* checksum */, 0 /* log_number */); - AtomicGroupReadBuffer read_buffer; - s = ReadAndRecover(reader, &read_buffer, cf_name_to_options, - column_families_not_found, builders, &log_read_status, - &version_edit_params, db_id); - current_manifest_file_size = reader.GetReadOffset(); - assert(current_manifest_file_size != 0); - } - - if (s.ok()) { - if (!version_edit_params.has_next_file_number_) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!version_edit_params.has_log_number_) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!version_edit_params.has_last_sequence_) { - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!version_edit_params.has_prev_log_number_) { - version_edit_params.SetPrevLogNumber(0); - } - - column_family_set_->UpdateMaxColumnFamily( - version_edit_params.max_column_family_); - - // When reading DB generated using old release, min_log_number_to_keep=0. - // All log files will be scanned for potential prepare entries. - MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_); - MarkFileNumberUsed(version_edit_params.prev_log_number_); - MarkFileNumberUsed(version_edit_params.log_number_); - } - - // there were some column families in the MANIFEST that weren't specified - // in the argument. This is OK in read_only mode - if (read_only == false && !column_families_not_found.empty()) { - std::string list_of_not_found; - for (const auto& cf : column_families_not_found) { - list_of_not_found += ", " + cf.second; - } - list_of_not_found = list_of_not_found.substr(2); - s = Status::InvalidArgument( - "You have to open all column families. Column families not opened: " + - list_of_not_found); - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto* builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } + VersionEditHandler handler(read_only, column_families, + const_cast(this), + /*track_missing_files=*/false, + /*no_error_if_files_missing=*/false, io_tracer_); + handler.Iterate(reader, &log_read_status); + s = handler.status(); + if (s.ok()) { + log_number = handler.GetVersionEditParams().log_number_; + current_manifest_file_size = reader.GetReadOffset(); + assert(current_manifest_file_size != 0); + handler.GetDbId(db_id); } } if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - if (read_only) { - cfd->table_cache()->SetTablesAreImmortal(); - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto builder = builders_iter->second->version_builder(); - - // unlimited table cache. Pre-load table handle now. - // Need to do it out of the mutex. 
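Recover(), TryRecoverFromOneManifest(), ListColumnFamilies() and DumpManifest() now share one replay pattern: wrap the MANIFEST in a log::Reader and let a handler consume it. A generic sketch of that pattern; the template helper is illustrative only, while the handler types (VersionEditHandler, VersionEditHandlerPointInTime, ListColumnFamiliesHandler, DumpManifestHandler) come from the patch:

// Illustrative only: the common shape of handler-based MANIFEST replay.
template <typename HandlerT>
Status ReplayManifest(log::Reader& reader, HandlerT& handler) {
  Status log_read_status;                  // filled via the log reporter
  handler.Iterate(reader, &log_read_status);
  return handler.status();                 // overall outcome of the replay
}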
- s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - true /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); - if (!s.ok()) { - if (db_options_->paranoid_checks) { - return s; - } - s = Status::OK(); - } - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, - current_version_number_++); - s = builder->SaveTo(v->storage_info()); - if (!s.ok()) { - delete v; - return s; - } - - // Install recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } - manifest_file_size_ = current_manifest_file_size; - next_file_number_.store(version_edit_params.next_file_number_ + 1); - last_allocated_sequence_ = version_edit_params.last_sequence_; - last_published_sequence_ = version_edit_params.last_sequence_; - last_sequence_ = version_edit_params.last_sequence_; - prev_log_number_ = version_edit_params.prev_log_number_; - ROCKS_LOG_INFO( db_options_->info_log, "Recovered from manifest file:%s succeeded," @@ -4751,9 +4624,8 @@ Status VersionSet::Recover( ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), - last_sequence_.load(), version_edit_params.log_number_, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { @@ -4901,7 +4773,9 @@ Status VersionSet::TryRecoverFromOneManifest( VersionEditHandlerPointInTime handler_pit( read_only, column_families, const_cast(this), io_tracer_); - handler_pit.Iterate(reader, &s, db_id); + handler_pit.Iterate(reader, &s); + + handler_pit.GetDbId(db_id); assert(nullptr != has_missing_table_file); *has_missing_table_file = handler_pit.HasMissingFiles(); @@ -4912,7 +4786,7 @@ Status VersionSet::TryRecoverFromOneManifest( Status VersionSet::ListColumnFamilies(std::vector* column_families, const std::string& dbname, FileSystem* fs) { - // these are just for performance reasons, not correcntes, + // these are just for performance reasons, not correctness, // so we're fine using the defaults FileOptions soptions; // Read "CURRENT" file, which contains a pointer to the current manifest file @@ -4935,48 +4809,23 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, nullptr /*IOTracer*/)); } - std::map column_family_names; - // default column family is always implicitly there - column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - if (edit.is_column_family_add_) { - if (column_family_names.find(edit.column_family_) != - column_family_names.end()) { - s = Status::Corruption("Manifest adding the same column family twice"); - break; - } - column_family_names.insert( - {edit.column_family_, edit.column_family_name_}); - } else if 
(edit.is_column_family_drop_) { - if (column_family_names.find(edit.column_family_) == - column_family_names.end()) { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - column_family_names.erase(edit.column_family_); - } - } + ListColumnFamiliesHandler handler; + handler.Iterate(reader, &s); + + assert(column_families); column_families->clear(); - if (s.ok()) { - for (const auto& iter : column_family_names) { + if (handler.status().ok()) { + for (const auto& iter : handler.GetColumnFamilyNames()) { column_families->push_back(iter.second); } } - return s; + return handler.status(); } #ifndef ROCKSDB_LITE @@ -4996,7 +4845,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, - nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/); + nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, + /*db_session_id*/ ""); Status status; std::vector dummy; @@ -5078,7 +4928,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } // Get the checksum information including the checksum and checksum function -// name of all SST files in VersionSet. Store the information in +// name of all SST and blob files in VersionSet. Store the information in // FileChecksumList which contains a map from file number to its checksum info. // If DB is not running, make sure call VersionSet::Recover() to load the file // metadata from Manifest to VersionSet before calling this function. @@ -5095,6 +4945,7 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { if (cfd->IsDropped() || !cfd->initialized()) { continue; } + /* SST files */ for (int level = 0; level < cfd->NumberLevels(); level++) { for (const auto& file : cfd->current()->storage_info()->LevelFiles(level)) { @@ -5102,17 +4953,36 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { file->file_checksum, file->file_checksum_func_name); if (!s.ok()) { - break; + return s; } } + } + + /* Blob files */ + const auto& blob_files = cfd->current()->storage_info()->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + + assert(meta); + assert(blob_file_number == meta->GetBlobFileNumber()); + + std::string checksum_value = meta->GetChecksumValue(); + std::string checksum_method = meta->GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (meta->GetChecksumMethod().empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + + s = checksum_list->InsertOneFileChecksum(blob_file_number, checksum_value, + checksum_method); if (!s.ok()) { - break; + return s; } } - if (!s.ok()) { - break; - } } + return s; } @@ -5135,194 +5005,19 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, std::move(file), dscname, db_options_->log_readahead_size, io_tracer_)); } - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t previous_log_number = 0; - int count = 0; - std::unordered_map comparators; - std::unordered_map> - builders; - - // add default column family - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - 
ColumnFamilyData* default_cfd = - CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - + std::vector column_families( + 1, ColumnFamilyDescriptor(kDefaultColumnFamilyName, options)); + DumpManifestHandler handler(column_families, this, io_tracer_, verbose, hex, + json); { VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - // Write out each individual edit - if (verbose && !json) { - printf("%s\n", edit.DebugString(hex).c_str()); - } else if (json) { - printf("%s\n", edit.DebugJSON(count, hex).c_str()); - } - count++; - - bool cf_in_builders = - builders.find(edit.column_family_) != builders.end(); - - if (edit.has_comparator_) { - comparators.insert({edit.column_family_, edit.comparator_}); - } - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders) { - s = Status::Corruption( - "Manifest adding the same column family twice"); - break; - } - cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); - cfd->set_initialized(); - builders.insert(std::make_pair( - edit.column_family_, std::unique_ptr( - new BaseReferencedVersionBuilder(cfd)))); - } else if (edit.is_column_family_drop_) { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - auto builder_iter = builders.find(edit.column_family_); - builders.erase(builder_iter); - comparators.erase(edit.column_family_); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - assert(cfd != nullptr); - cfd->UnrefAndTryDelete(); - cfd = nullptr; - } else { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest record referencing unknown column family"); - break; - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - s = builder->second->version_builder()->Apply(&edit); - if (!s.ok()) { - break; - } - } - - if (cfd != nullptr && edit.has_log_number_) { - cfd->SetLogNumber(edit.log_number_); - } - - - if (edit.has_prev_log_number_) { - previous_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; - } - - if (edit.has_max_column_family_) { - column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_); - } - - if (edit.has_min_log_number_to_keep_) { - MarkMinLogNumberToKeep2PC(edit.min_log_number_to_keep_); - } - } + handler.Iterate(reader, &s); } - file_reader.reset(); - - if (s.ok()) { - if (!have_next_file) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - printf("no meta-nextfile entry in descriptor"); - } else if (!have_last_sequence) { - printf("no last-sequence-number entry in descriptor"); - s = Status::Corruption("no 
last-sequence-number entry in descriptor"); - } - if (!have_prev_log_number) { - previous_log_number = 0; - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto builder = builders_iter->second->version_builder(); - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, - current_version_number_++); - s = builder->SaveTo(v->storage_info()); - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false); - - printf("--------------- Column family \"%s\" (ID %" PRIu32 - ") --------------\n", - cfd->GetName().c_str(), cfd->GetID()); - printf("log number: %" PRIu64 "\n", cfd->GetLogNumber()); - auto comparator = comparators.find(cfd->GetID()); - if (comparator != comparators.end()) { - printf("comparator: %s\n", comparator->second.c_str()); - } else { - printf("comparator: \n"); - } - printf("%s \n", v->DebugString(hex).c_str()); - delete v; - } - - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; - - printf("next_file_number %" PRIu64 " last_sequence %" PRIu64 - " prev_log_number %" PRIu64 " max_column_family %" PRIu32 - " min_log_number_to_keep " - "%" PRIu64 "\n", - next_file_number_.load(), last_sequence, previous_log_number, - column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); - } - - return s; + return handler.status(); } #endif // ROCKSDB_LITE @@ -5343,7 +5038,7 @@ void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { Status VersionSet::WriteCurrentStateToManifest( const std::unordered_map& curr_state, - log::Writer* log, IOStatus& io_s) { + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { // TODO: Break up into multiple records to reduce memory usage on recovery? // WARNING: This method doesn't hold a mutex!! @@ -5368,6 +5063,21 @@ Status VersionSet::WriteCurrentStateToManifest( } } + // Save WALs. + if (!wal_additions.GetWalAdditions().empty()) { + TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal", + const_cast(&wal_additions)); + std::string record; + if (!wal_additions.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit: " + + wal_additions.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; + } + } + for (auto cfd : *column_family_set_) { assert(cfd); @@ -5438,6 +5148,21 @@ Status VersionSet::WriteCurrentStateToManifest( assert(iter != curr_state.end()); uint64_t log_number = iter->second.log_number; edit.SetLogNumber(log_number); + + if (cfd->GetID() == 0) { + // min_log_number_to_keep is for the whole db, not for specific column family. + // So it does not need to be set for every column family, just need to be set once. + // Since default CF can never be dropped, we set the min_log to the default CF here. 
+ uint64_t min_log = min_log_number_to_keep_2pc(); + if (min_log != 0) { + edit.SetMinLogNumberToKeep(min_log); + } + } + + const std::string& full_history_ts_low = iter->second.full_history_ts_low; + if (!full_history_ts_low.empty()) { + edit.SetFullHistoryTsLow(full_history_ts_low); + } std::string record; if (!edit.EncodeTo(&record)) { return Status::Corruption( @@ -5785,20 +5510,6 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { "[%s] compaction output being applied to a different base version from" " input version", c->column_family_data()->GetName().c_str()); - - if (vstorage->compaction_style_ == kCompactionStyleLevel && - c->start_level() == 0 && c->num_input_levels() > 2U) { - // We are doing a L0->base_level compaction. The assumption is if - // base level is not L1, levels from L1 to base_level - 1 is empty. - // This is ensured by having one compaction from L0 going on at the - // same time in level-based compaction. So that during the time, no - // compaction/flush can put files to those levels. - for (int l = c->start_level() + 1; l < c->output_level(); l++) { - if (vstorage->NumLevelFiles(l) != 0) { - return false; - } - } - } } for (size_t input = 0; input < c->num_input_levels(); ++input) { @@ -5881,6 +5592,9 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; filemetadata.file_checksum = file->file_checksum; filemetadata.file_checksum_func_name = file->file_checksum_func_name; + filemetadata.temperature = file->temperature; + filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime(); + filemetadata.file_creation_time = file->TryGetFileCreationTime(); metadata->push_back(filemetadata); } } @@ -5996,8 +5710,8 @@ ReactiveVersionSet::ReactiveVersionSet( const std::shared_ptr& io_tracer) : VersionSet(dbname, _db_options, _file_options, table_cache, write_buffer_manager, write_controller, - /*block_cache_tracer=*/nullptr, io_tracer), - number_of_edits_to_skip_(0) {} + /*block_cache_tracer=*/nullptr, io_tracer, + /*db_session_id*/ "") {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -6010,394 +5724,44 @@ Status ReactiveVersionSet::Recover( assert(manifest_reporter != nullptr); assert(manifest_reader_status != nullptr); - std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.insert({cf.name, cf.options}); - } - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. 
- default_cfd->set_initialized(); - VersionBuilderMap builders; - std::unordered_map column_families_not_found; - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - manifest_reader_status->reset(new Status()); manifest_reporter->reset(new LogReporter()); static_cast_with_check(manifest_reporter->get())->status = manifest_reader_status->get(); Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); log::Reader* reader = manifest_reader->get(); + assert(reader); - int retry = 0; - VersionEdit version_edit; - while (s.ok() && retry < 1) { - assert(reader != nullptr); - s = ReadAndRecover(*reader, &read_buffer_, cf_name_to_options, - column_families_not_found, builders, - manifest_reader_status->get(), &version_edit); - if (s.ok()) { - bool enough = version_edit.has_next_file_number_ && - version_edit.has_log_number_ && - version_edit.has_last_sequence_; - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - if (cfd == nullptr) { - enough = false; - break; - } - } - } - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - assert(cfd != nullptr); - if (!cfd->IsDropped()) { - auto builder_iter = builders.find(cfd->GetID()); - assert(builder_iter != builders.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - true /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); - if (!s.ok()) { - enough = false; - if (s.IsPathNotFound()) { - s = Status::OK(); - } - break; - } - } - } - } - if (enough) { - break; - } - } - ++retry; - } - - if (s.ok()) { - if (!version_edit.has_prev_log_number_) { - version_edit.prev_log_number_ = 0; - } - column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_); - - MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_); - MarkFileNumberUsed(version_edit.prev_log_number_); - MarkFileNumberUsed(version_edit.log_number_); + manifest_tailer_.reset(new ManifestTailer( + column_families, const_cast(this), io_tracer_)); - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto* builder = builders_iter->second->version_builder(); + manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, - current_version_number_++); - s = builder->SaveTo(v->storage_info()); - - if (s.ok()) { - // Install recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } else { - ROCKS_LOG_ERROR(db_options_->info_log, - "[%s]: inconsistent version: %s\n", - cfd->GetName().c_str(), s.ToString().c_str()); - delete v; - break; - } - } - } - 
if (s.ok()) { - next_file_number_.store(version_edit.next_file_number_ + 1); - last_allocated_sequence_ = version_edit.last_sequence_; - last_published_sequence_ = version_edit.last_sequence_; - last_sequence_ = version_edit.last_sequence_; - prev_log_number_ = version_edit.prev_log_number_; - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - ROCKS_LOG_INFO(db_options_->info_log, - "Column family [%s] (ID %u), log number is %" PRIu64 "\n", - cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); - } - } - return s; + return manifest_tailer_->status(); } Status ReactiveVersionSet::ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + Status* manifest_read_status, std::unordered_set* cfds_changed) { assert(manifest_reader != nullptr); assert(cfds_changed != nullptr); mu->AssertHeld(); Status s; - uint64_t applied_edits = 0; - while (s.ok()) { - Slice record; - std::string scratch; - log::Reader* reader = manifest_reader->get(); - std::string old_manifest_path = reader->file()->file_name(); - while (reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - // Skip the first VersionEdits of each MANIFEST generated by - // VersionSet::WriteCurrentStatetoManifest. - if (number_of_edits_to_skip_ > 0) { - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - if (cfd != nullptr && !cfd->IsDropped()) { - --number_of_edits_to_skip_; - } - continue; - } - - s = read_buffer_.AddEdit(&edit); - if (!s.ok()) { - break; - } - VersionEdit temp_edit; - if (edit.is_in_atomic_group_) { - if (read_buffer_.IsFull()) { - // Apply edits in an atomic group when we have read all edits in the - // group. - for (auto& e : read_buffer_.replay_buffer()) { - s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit); - if (!s.ok()) { - break; - } - applied_edits++; - } - if (!s.ok()) { - break; - } - read_buffer_.Clear(); - } - } else { - // Apply a normal edit immediately. - s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit); - if (s.ok()) { - applied_edits++; - } else { - break; - } - } - } - if (!s.ok()) { - // Clear the buffer if we fail to decode/apply an edit. - read_buffer_.Clear(); - } - // It's possible that: - // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. - // Or the version(s) rebuilt from tailing the MANIFEST is inconsistent. - // 2) we have finished reading the current MANIFEST. - // 3) we have encountered an IOError reading the current MANIFEST. - // We need to look for the next MANIFEST and start from there. If we cannot - // find the next MANIFEST, we should exit the loop. - Status tmp_s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); - reader = manifest_reader->get(); - if (tmp_s.ok()) { - if (reader->file()->file_name() == old_manifest_path) { - // Still processing the same MANIFEST, thus no need to continue this - // loop since no record is available if we have reached here. - break; - } else { - // We have switched to a new MANIFEST whose first records have been - // generated by VersionSet::WriteCurrentStatetoManifest. Since the - // secondary instance has already finished recovering upon start, there - // is no need for the secondary to process these records. Actually, if - // the secondary were to replay these records, the secondary may end up - // adding the same SST files AGAIN to each column family, causing - // consistency checks done by VersionBuilder to fail. 
Therefore, we - // record the number of records to skip at the beginning of the new - // MANIFEST and ignore them. - number_of_edits_to_skip_ = 0; - for (auto* cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - // Increase number_of_edits_to_skip by 2 because - // WriteCurrentStatetoManifest() writes 2 version edits for each - // column family at the beginning of the newly-generated MANIFEST. - // TODO(yanqin) remove hard-coded value. - if (db_options_->write_dbid_to_manifest) { - number_of_edits_to_skip_ += 3; - } else { - number_of_edits_to_skip_ += 2; - } - } - s = tmp_s; - } - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - auto builder_iter = active_version_builders_.find(cfd->GetID()); - if (builder_iter == active_version_builders_.end()) { - continue; - } - auto builder = builder_iter->second->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } - } - TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", - &applied_edits); - return s; -} - -Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, std::unordered_set* cfds_changed, - VersionEdit* version_edit) { - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - - // If we cannot find this column family in our column family set, then it - // may be a new column family created by the primary after the secondary - // starts. It is also possible that the secondary instance opens only a subset - // of column families. Ignore it for now. - if (nullptr == cfd) { - return Status::OK(); - } - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end() && - !cfd->IsDropped()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - - if (edit.is_column_family_add_) { - // TODO (yanqin) for now the secondary ignores column families created - // after Open. This also simplifies handling of switching to a new MANIFEST - // and processing the snapshot of the system at the beginning of the - // MANIFEST. - } else if (edit.is_column_family_drop_) { - // Drop the column family by setting it to be 'dropped' without destroying - // the column family handle. - // TODO (haoyu) figure out how to handle column faimly drop for - // secondary instance. (Is it possible that the ref count for cfd is 0 but - // the ref count for its versions is higher than 0?) 
- cfd->SetDropped(); - if (cfd->UnrefAndTryDelete()) { - cfd = nullptr; - } - active_version_builders_.erase(builder_iter); - } else { - Status s = builder->Apply(&edit); - if (!s.ok()) { - return s; - } - } - Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit); + log::Reader* reader = manifest_reader->get(); + assert(reader); + s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); if (!s.ok()) { return s; } - - if (cfd != nullptr && !cfd->IsDropped()) { - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ApplyOneVersionEditToBuilder:" - "AfterLoadTableHandlers", - &s); - - if (s.ok()) { - auto version = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, - current_version_number_++); - s = builder->SaveTo(version->storage_info()); - if (s.ok()) { - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); - } - } else { - delete version; - } - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } - // Some other error has occurred during LoadTableHandlers. - } - + manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status); + s = manifest_tailer_->status(); if (s.ok()) { - if (version_edit->HasNextFile()) { - next_file_number_.store(version_edit->next_file_number_ + 1); - } - if (version_edit->has_last_sequence_) { - last_allocated_sequence_ = version_edit->last_sequence_; - last_published_sequence_ = version_edit->last_sequence_; - last_sequence_ = version_edit->last_sequence_; - } - if (version_edit->has_prev_log_number_) { - prev_log_number_ = version_edit->prev_log_number_; - MarkFileNumberUsed(version_edit->prev_log_number_); - } - if (version_edit->has_log_number_) { - MarkFileNumberUsed(version_edit->log_number_); - } - column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_); - MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_); + *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies()); } + return s; } @@ -6421,7 +5785,7 @@ Status ReactiveVersionSet::MaybeSwitchManifest( "ReactiveVersionSet::MaybeSwitchManifest:" "AfterGetCurrentManifestPath:1"); s = fs_->NewSequentialFile(manifest_path, - env_->OptimizeForManifestRead(file_options_), + fs_->OptimizeForManifestRead(file_options_), &manifest_file, nullptr); } else { // No need to switch manifest. @@ -6438,15 +5802,24 @@ Status ReactiveVersionSet::MaybeSwitchManifest( true /* checksum */, 0 /* log_number */)); ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", manifest_path.c_str()); - // TODO (yanqin) every time we switch to a new MANIFEST, we clear the - // active_version_builders_ map because we choose to construct the - // versions from scratch, thanks to the first part of each MANIFEST - // written by VersionSet::WriteCurrentStatetoManifest. This is not - // necessary, but we choose this at present for the sake of simplicity. 
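With the hand-rolled edit replay removed, ReadAndApply() above delegates to the ManifestTailer and reports the changed column families back to the caller. A hypothetical catch-up loop in a secondary instance built on that signature; the local variables (mu, manifest_reader, reactive_versions) are assumed:

// Hypothetical secondary-instance catch-up built on the new ReadAndApply().
std::unordered_set<ColumnFamilyData*> cfds_changed;
Status manifest_read_status;
Status s = reactive_versions->ReadAndApply(&mu, &manifest_reader,
                                           &manifest_read_status,
                                           &cfds_changed);
if (s.ok()) {
  for (ColumnFamilyData* cfd : cfds_changed) {
    // e.g. install new SuperVersions for the column families whose
    // Version advanced while tailing the primary's MANIFEST.
  }
}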
- active_version_builders_.clear(); + if (manifest_tailer_) { + manifest_tailer_->PrepareToReadNewManifest(); + } } } while (s.IsPathNotFound()); return s; } +#ifndef NDEBUG +uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group(); +} +#endif // !NDEBUG + +std::vector& ReactiveVersionSet::replay_buffer() { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().replay_buffer(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_set.h b/db/version_set.h index 93b4509611d..4a593e97cdf 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -26,9 +26,11 @@ #include #include #include +#include #include #include +#include "cache/cache_helpers.h" #include "db/blob/blob_file_meta.h" #include "db/column_family.h" #include "db/compaction/compaction.h" @@ -58,6 +60,7 @@ namespace log { class Writer; } +class BlobIndex; class Compaction; class LogBuffer; class LookupKey; @@ -68,6 +71,8 @@ class WriteBufferManager; class MergeContext; class ColumnFamilySet; class MergeIteratorBuilder; +class SystemClock; +class ManifestTailer; // VersionEdit is always supposed to be valid and it is used to point at // entries in Manifest. Ideally it should not be used as a container to @@ -145,7 +150,7 @@ class VersionStorageInfo { // We use compaction scores to figure out which compaction to do next // REQUIRES: db_mutex held!! // TODO find a better way to pass compaction_options_fifo. - void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options, + void ComputeCompactionScore(const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options); // Estimate est_comp_needed_bytes_ @@ -158,13 +163,13 @@ class VersionStorageInfo { // This computes ttl_expired_files_ and is called by // ComputeCompactionScore() - void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions, + void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions, const uint64_t ttl); // This computes files_marked_for_periodic_compaction_ and is called by // ComputeCompactionScore() void ComputeFilesMarkedForPeriodicCompaction( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const uint64_t periodic_compaction_seconds); // This computes bottommost_files_marked_for_compaction_ and is called by @@ -340,6 +345,19 @@ class VersionStorageInfo { using BlobFiles = std::map>; const BlobFiles& GetBlobFiles() const { return blob_files_; } + uint64_t GetTotalBlobFileSize() const { + uint64_t total_blob_bytes = 0; + + for (const auto& pair : blob_files_) { + const auto& meta = pair.second; + assert(meta); + + total_blob_bytes += meta->GetTotalBlobBytes(); + } + + return total_blob_bytes; + } + const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const { assert(level < static_cast(level_files_brief_.size())); return level_files_brief_[level]; @@ -464,7 +482,7 @@ class VersionStorageInfo { uint64_t MaxBytesForLevel(int level) const; // Must be called after any change to MutableCFOptions. - void CalculateBaseBytes(const ImmutableCFOptions& ioptions, + void CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options); // Returns an estimate of the amount of live data in bytes. 
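For orientation: this patch replaces the secondary instance's hand-rolled MANIFEST replay (active_version_builders_, number_of_edits_to_skip_) with a ManifestTailer owned by ReactiveVersionSet, and ReadAndApply now simply delegates to it. Below is a minimal sketch (not part of the patch) of the public surface this machinery serves; it assumes the existing DB::OpenAsSecondary and DB::TryCatchUpWithPrimary entry points and uses placeholder paths.

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  ROCKSDB_NAMESPACE::Options options;
  options.max_open_files = -1;  // secondary instances have required this setting
  ROCKSDB_NAMESPACE::DB* secondary = nullptr;
  // "/path/to/primary_db" and "/path/to/secondary_db" are placeholders.
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DB::OpenAsSecondary(
      options, "/path/to/primary_db", "/path/to/secondary_db", &secondary);
  if (s.ok()) {
    // Tail the primary's MANIFEST once; in practice this is called
    // periodically. Under the hood it drives ReactiveVersionSet::ReadAndApply
    // and the ManifestTailer introduced in this change.
    s = secondary->TryCatchUpWithPrimary();
  }
  delete secondary;
  return s.ok() ? 0 : 1;
}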
@@ -679,7 +697,21 @@ class Version { bool* is_blob = nullptr, bool do_merge = true); void MultiGet(const ReadOptions&, MultiGetRange* range, - ReadCallback* callback = nullptr, bool* is_blob = nullptr); + ReadCallback* callback = nullptr); + + // Interprets blob_index_slice as a blob reference, and (assuming the + // corresponding blob file is part of this Version) retrieves the blob and + // saves it in *value. + // REQUIRES: blob_index_slice stores an encoded blob reference + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, PinnableSlice* value, + uint64_t* bytes_read) const; + + // Retrieves a blob using a blob reference and saves it in *value, + // assuming the corresponding blob file is part of this Version. + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, PinnableSlice* value, + uint64_t* bytes_read) const; // Loads some stats information from files. Call without mutex held. It needs // to be called before applying the version to the version set. @@ -741,10 +773,8 @@ class Version { ColumnFamilyData* cfd() const { return cfd_; } - // Return the next Version in the linked list. Used for debug only - Version* TEST_Next() const { - return next_; - } + // Return the next Version in the linked list. + Version* Next() const { return next_; } int TEST_refs() const { return refs_; } @@ -764,6 +794,8 @@ class Version { private: Env* env_; + SystemClock* clock_; + friend class ReactiveVersionSet; friend class VersionSet; friend class VersionEditHandler; @@ -800,6 +832,7 @@ class Version { Logger* info_log_; Statistics* db_statistics_; TableCache* table_cache_; + BlobFileCache* blob_file_cache_; const MergeOperator* merge_operator_; VersionStorageInfo storage_info_; @@ -876,6 +909,7 @@ class BaseReferencedVersionBuilder; class AtomicGroupReadBuffer { public: + AtomicGroupReadBuffer() = default; Status AddEdit(VersionEdit* edit); void Clear(); bool IsFull() const; @@ -901,13 +935,25 @@ class VersionSet { WriteBufferManager* write_buffer_manager, WriteController* write_controller, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + const std::string& db_session_id); // No copying allowed VersionSet(const VersionSet&) = delete; void operator=(const VersionSet&) = delete; virtual ~VersionSet(); + Status LogAndApplyToDefaultColumnFamily( + VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr) { + ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); + const MutableCFOptions* cf_options = + default_cf->GetLatestMutableCFOptions(); + return LogAndApply(default_cf, *cf_options, edit, mu, db_directory, + new_descriptor_log, column_family_options); + } + // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new // current version. Will release *mu while actually writing to the file. 
@@ -938,7 +984,8 @@ class VersionSet { const MutableCFOptions& mutable_cf_options, const autovector& edit_list, InstrumentedMutex* mu, FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* column_family_options = nullptr) { + const ColumnFamilyOptions* column_family_options = nullptr, + const std::function& manifest_wcb = {}) { autovector cfds; cfds.emplace_back(column_family_data); autovector mutable_cf_options_list; @@ -946,7 +993,8 @@ class VersionSet { autovector> edit_lists; edit_lists.emplace_back(edit_list); return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - db_directory, new_descriptor_log, column_family_options); + db_directory, new_descriptor_log, column_family_options, + {manifest_wcb}); } // The across-multi-cf batch version. If edit_lists contain more than @@ -958,7 +1006,9 @@ class VersionSet { const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* new_cf_options = nullptr); + const ColumnFamilyOptions* new_cf_options = nullptr, + const std::vector>& manifest_wcbs = + {}); static Status GetCurrentManifestPath(const std::string& dbname, FileSystem* fs, @@ -1096,10 +1146,28 @@ class VersionSet { return PreComputeMinLogNumberWithUnflushedData(nullptr); } // Returns the minimum log number which still has data not flushed to any SST + // file. + // Empty column families' log number is considered to be + // new_log_number_for_empty_cf. + uint64_t PreComputeMinLogNumberWithUnflushedData( + uint64_t new_log_number_for_empty_cf) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + uint64_t num = + cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber(); + if (min_log_num > num && !cfd->IsDropped()) { + min_log_num = num; + } + } + return min_log_num; + } + // Returns the minimum log number which still has data not flushed to any SST // file, except data from `cfd_to_skip`. uint64_t PreComputeMinLogNumberWithUnflushedData( const ColumnFamilyData* cfd_to_skip) const { - uint64_t min_log_num = std::numeric_limits::max(); + uint64_t min_log_num = port::kMaxUint64; for (auto cfd : *column_family_set_) { if (cfd == cfd_to_skip) { continue; @@ -1112,6 +1180,23 @@ class VersionSet { } return min_log_num; } + // Returns the minimum log number which still has data not flushed to any SST + // file, except data from `cfds_to_skip`. + uint64_t PreComputeMinLogNumberWithUnflushedData( + const std::unordered_set& cfds_to_skip) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + if (cfds_to_skip.count(cfd)) { + continue; + } + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { + min_log_num = cfd->GetLogNumber(); + } + } + return min_log_num; + } // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. 
@@ -1150,6 +1235,10 @@ class VersionSet { void GetLiveFilesMetaData(std::vector *metadata); void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) { + assert(table_cache_); + + table_cache_->Erase(GetSlice(&blob_file_number)); + obsolete_blob_files_.emplace_back(blob_file_number, std::move(path)); } @@ -1174,6 +1263,7 @@ class VersionSet { // Get the IO Status returned by written Manifest. const IOStatus& io_status() const { return io_status_; } + // The returned WalSet needs to be accessed with DB mutex held. const WalSet& GetWalSet() const { return wals_; } void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) { @@ -1198,6 +1288,7 @@ class VersionSet { friend class Version; friend class VersionEditHandler; friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; friend class DBImpl; friend class DBImplReadOnly; friend class ManifestReader; @@ -1225,54 +1316,39 @@ class VersionSet { struct MutableCFState { uint64_t log_number; + std::string full_history_ts_low; + + explicit MutableCFState() = default; + explicit MutableCFState(uint64_t _log_number, std::string ts_low) + : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {} }; // Save current contents to *log Status WriteCurrentStateToManifest( const std::unordered_map& curr_state, - log::Writer* log, IOStatus& io_s); + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s); void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, const VersionEdit* edit); - Status ReadAndRecover( - log::Reader& reader, AtomicGroupReadBuffer* read_buffer, - const std::unordered_map& - name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map< - uint32_t, std::unique_ptr>& builders, - Status* log_read_status, VersionEditParams* version_edit, - std::string* db_id = nullptr); - - // REQUIRES db mutex - Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, - const std::unordered_map& name_to_opts, - std::unordered_map& column_families_not_found, - std::unordered_map< - uint32_t, std::unique_ptr>& builders, - VersionEditParams* version_edit); - - Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, - const VersionEdit& from_edit, - VersionEditParams* version_edit_params); - Status VerifyFileMetadata(const std::string& fpath, const FileMetaData& meta) const; + // Protected by DB mutex. WalSet wals_; std::unique_ptr column_family_set_; + Cache* table_cache_; Env* const env_; FileSystemPtr const fs_; + SystemClock* const clock_; const std::string dbname_; std::string db_id_; const ImmutableDBOptions* const db_options_; std::atomic next_file_number_; - // Any log number equal or lower than this should be ignored during recovery, + // Any WAL number smaller than this should be ignored during recovery, // and is qualified for being deleted in 2PC mode. In non-2PC mode, this // number is ignored. std::atomic min_log_number_to_keep_2pc_ = {0}; @@ -1322,6 +1398,8 @@ class VersionSet { std::shared_ptr io_tracer_; + std::string db_session_id_; + private: // REQUIRES db mutex at beginning. 
may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, @@ -1352,23 +1430,20 @@ class ReactiveVersionSet : public VersionSet { Status ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + Status* manifest_read_status, std::unordered_set* cfds_changed); Status Recover(const std::vector& column_families, std::unique_ptr* manifest_reader, std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); +#ifndef NDEBUG + uint64_t TEST_read_edits_in_atomic_group() const; +#endif //! NDEBUG - uint64_t TEST_read_edits_in_atomic_group() const { - return read_buffer_.TEST_read_edits_in_atomic_group(); - } - std::vector& replay_buffer() { - return read_buffer_.replay_buffer(); - } + std::vector& replay_buffer(); protected: - using VersionSet::ApplyOneVersionEditToBuilder; - // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( VersionEdit& edit, std::unordered_set* cfds_changed, @@ -1379,11 +1454,7 @@ class ReactiveVersionSet : public VersionSet { std::unique_ptr* manifest_reader); private: - VersionBuilderMap active_version_builders_; - AtomicGroupReadBuffer read_buffer_; - // Number of version edits to skip by ReadAndApply at the beginning of a new - // MANIFEST created by primary. - int number_of_edits_to_skip_; + std::unique_ptr manifest_tailer_; using VersionSet::LogAndApply; using VersionSet::Recover; @@ -1393,8 +1464,9 @@ class ReactiveVersionSet : public VersionSet { const autovector& /*mutable_cf_options_list*/, const autovector>& /*edit_lists*/, InstrumentedMutex* /*mu*/, FSDirectory* /*db_directory*/, - bool /*new_descriptor_log*/, - const ColumnFamilyOptions* /*new_cf_option*/) override { + bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, + const std::vector>& /*manifest_wcbs*/) + override { return Status::NotSupported("not supported in reactive mode"); } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 8f2134dcee3..75919c6edc8 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -8,10 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_set.h" + #include "db/db_impl/db_impl.h" #include "db/log_writer.h" -#include "env/mock_env.h" -#include "logging/logging.h" +#include "rocksdb/convenience.h" +#include "rocksdb/file_system.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -102,7 +104,7 @@ class VersionStorageInfoTestBase : public testing::Test { InternalKeyComparator icmp_; std::shared_ptr logger_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; VersionStorageInfo vstorage_; @@ -692,44 +694,39 @@ class VersionSetTestBase { int num_initial_edits_; explicit VersionSetTestBase(const std::string& name) - : mem_env_(nullptr), - env_(nullptr), - env_guard_(), - fs_(), + : env_(nullptr), dbname_(test::PerThreadDBPath(name)), options_(), db_options_(options_), cf_options_(options_), - immutable_cf_options_(db_options_, cf_options_), + immutable_options_(db_options_, cf_options_), mutable_cf_options_(cf_options_), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), mock_table_factory_(std::make_shared()) { - const char* test_env_uri = getenv("TEST_ENV_URI"); - Env* base_env = nullptr; - if (test_env_uri) { - Status s = Env::LoadEnv(test_env_uri, &base_env, &env_guard_); - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } else { - base_env = Env::Default(); - } - EXPECT_NE(nullptr, base_env); - if (getenv("MEM_ENV")) { - mem_env_ = new MockEnv(base_env); + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_)); + if (env_ == Env::Default() && getenv("MEM_ENV")) { + env_guard_.reset(NewMemEnv(Env::Default())); + env_ = env_guard_.get(); } - env_ = mem_env_ ? 
mem_env_ : base_env; + EXPECT_NE(nullptr, env_); - fs_ = std::make_shared(env_); - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + fs_ = env_->GetFileSystem(); + EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr)); + options_.env = env_; db_options_.env = env_; db_options_.fs = fs_; + immutable_options_.env = env_; + immutable_options_.fs = fs_; + immutable_options_.clock = env_->GetSystemClock().get(); + versions_.reset( new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); reactive_versions_ = std::make_shared( dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, nullptr); @@ -745,10 +742,6 @@ class VersionSetTestBase { options.env = env_; EXPECT_OK(DestroyDB(dbname_, options)); } - if (mem_env_) { - delete mem_env_; - mem_env_ = nullptr; - } } protected: @@ -760,7 +753,9 @@ class VersionSetTestBase { assert(log_writer != nullptr); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - std::unique_ptr impl(new DBImpl(DBOptions(), dbname_)); + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; impl->GetDbIdentityFromIdentityFile(&db_id); new_db.SetDBId(db_id); @@ -787,13 +782,13 @@ class VersionSetTestBase { } *last_seqno = last_seq; num_initial_edits_ = static_cast(new_cfs.size() + 1); + std::unique_ptr file_writer; const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); { log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; @@ -816,21 +811,29 @@ class VersionSetTestBase { // Create DB with 3 column families. void NewDB() { - std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; SetIdentityFile(env_, dbname_); - PrepareManifest(&column_families, &last_seqno, &log_writer); + PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. 
Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); } + void ReopenDB() { + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families_, false)); + } + void VerifyManifest(std::string* manifest_path) const { assert(manifest_path != nullptr); uint64_t manifest_file_number = 0; @@ -840,7 +843,63 @@ class VersionSetTestBase { ASSERT_EQ(1, manifest_file_number); } - MockEnv* mem_env_; + Status LogAndApplyToDefaultCF(VersionEdit& edit) { + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + return s; + } + + Status LogAndApplyToDefaultCF( + const autovector>& edits) { + autovector vedits; + for (auto& e : edits) { + vedits.push_back(e.get()); + } + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, vedits, &mutex_); + mutex_.Unlock(); + return s; + } + + void CreateNewManifest() { + constexpr FSDirectory* db_directory = nullptr; + constexpr bool new_descriptor_log = true; + mutex_.Lock(); + VersionEdit dummy; + ASSERT_OK(versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + &dummy, &mutex_, db_directory, new_descriptor_log)); + mutex_.Unlock(); + } + + ColumnFamilyData* CreateColumnFamily(const std::string& cf_name, + const ColumnFamilyOptions& cf_options) { + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + new_cf.SetColumnFamily(new_id); + new_cf.SetLogNumber(0); + new_cf.SetComparatorName(cf_options.comparator->Name()); + Status s; + mutex_.Lock(); + s = versions_->LogAndApply(/*column_family_data=*/nullptr, + MutableCFOptions(cf_options), &new_cf, &mutex_, + /*db_directory=*/nullptr, + /*new_descriptor_log=*/false, &cf_options); + mutex_.Unlock(); + EXPECT_OK(s); + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(cf_name); + EXPECT_NE(nullptr, cfd); + return cfd; + } + + Env* mem_env_; Env* env_; std::shared_ptr env_guard_; std::shared_ptr fs_; @@ -849,7 +908,7 @@ class VersionSetTestBase { Options options_; ImmutableDBOptions db_options_; ColumnFamilyOptions cf_options_; - ImmutableCFOptions immutable_cf_options_; + ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; std::shared_ptr table_cache_; WriteController write_controller_; @@ -859,6 +918,7 @@ class VersionSetTestBase { InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; + std::vector column_families_; }; const std::string VersionSetTestBase::kColumnFamilyName1 = "alice"; @@ -979,17 +1039,8 @@ TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) { [&](void* /* arg */) { ++garbage_encoded; }); SyncPoint::GetInstance()->EnableProcessing(); - VersionEdit dummy; - - mutex_.Lock(); - constexpr FSDirectory* db_directory = nullptr; - constexpr bool new_descriptor_log = true; - Status s = versions_->LogAndApply( - versions_->GetColumnFamilySet()->GetDefault(), 
mutable_cf_options_, - &dummy, &mutex_, db_directory, new_descriptor_log); - mutex_.Unlock(); + CreateNewManifest(); - ASSERT_OK(s); ASSERT_EQ(addition_encoded, 2); ASSERT_EQ(garbage_encoded, 1); @@ -1158,6 +1209,600 @@ TEST_F(VersionSetTest, ObsoleteBlobFile) { } } +TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) { + NewDB(); + + constexpr uint64_t kNumWals = 5; + + autovector> edits; + // Add some WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + } + // Delete the first half of the WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. + ASSERT_EQ(versions.size(), 1); + ASSERT_EQ(versions[0], nullptr); +} + +// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit. +TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) { + NewDB(); + + const std::string kDBId = "db_db"; + constexpr uint64_t kNumWals = 5; + + autovector> edits; + // Add some WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + } + // Delete the first half of the WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. + ASSERT_EQ(versions.size(), 1); + ASSERT_NE(versions[0], nullptr); +} + +TEST_F(VersionSetTest, WalAddition) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced for several times before closing. + { + for (uint64_t size_delta = 100; size_delta > 0; size_delta /= 2) { + uint64_t size = kSizeInBytes - size_delta; + WalMetadata wal(size); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), size); + } + } + + // The WAL is closed. 
+ { + WalMetadata wal(kSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Recover a new VersionSet. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalCloseWithoutSync) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced before closing. + { + WalMetadata wal(kSyncedSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } + + // A new WAL with larger log number is created, + // implicitly marking the current WAL closed. + { + VersionEdit edit; + edit.AddWal(kLogNumber + 1); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + ASSERT_TRUE(wals.find(kLogNumber + 1) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber + 1).HasSyncedSize()); + } + + // Recover a new VersionSet. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalDeletion) { + NewDB(); + + constexpr WalNumber kClosedLogNumber = 10; + constexpr WalNumber kNonClosedLogNumber = 20; + constexpr uint64_t kSizeInBytes = 111; + + // Add a non-closed and a closed WAL. 
+ { + VersionEdit edit; + edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes)); + edit.AddWal(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Delete the closed WAL. + { + VersionEdit edit; + edit.DeleteWalsBefore(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Recover a new VersionSet, only the non-closed WAL should show up. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Force the creation of a new MANIFEST file, + // only the non-closed WAL should be written to the new MANIFEST. + { + std::vector wal_additions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) { + VersionEdit* edit = reinterpret_cast(arg); + ASSERT_TRUE(edit->IsWalAddition()); + for (auto& addition : edit->GetWalAdditions()) { + wal_additions.push_back(addition); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateNewManifest(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(wal_additions.size(), 1); + ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber); + ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize()); + } + + // Recover from the new MANIFEST, only the non-closed WAL should show up. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } +} + +TEST_F(VersionSetTest, WalCreateTwice) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") != + std::string::npos) + << s.ToString(); +} + +TEST_F(VersionSetTest, WalCreateAfterClose) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. 
+ VersionEdit edit; + edit.AddWal(kLogNumber); + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Create the same WAL again. + VersionEdit edit; + edit.AddWal(kLogNumber); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionSetTest, AddWalWithSmallerSize) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Add the same WAL with smaller synced size. + VersionEdit edit; + WalMetadata wal(kSizeInBytes / 2); + edit.AddWal(kLogNumber, wal); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + s.ToString().find( + "WAL 10 must not have smaller synced size than previous one") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) { + NewDB(); + + constexpr WalNumber kLogNumber0 = 10; + constexpr WalNumber kLogNumber1 = 20; + constexpr WalNumber kNonExistingNumber = 15; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add closed WALs. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber0, wal); + edit.AddWal(kLogNumber1, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Delete WALs before a non-existing WAL. + VersionEdit edit; + edit.DeleteWalsBefore(kNonExistingNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, WAL0 is deleted, WAL1 is not. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber1) != wals.end()); + } +} + +TEST_F(VersionSetTest, DeleteAllWals) { + NewDB(); + + constexpr WalNumber kMaxLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kMaxLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + VersionEdit edit; + edit.DeleteWalsBefore(kMaxLogNumber + 10); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, all WALs are deleted. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 0); + } +} + +TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { + NewDB(); + + constexpr int kAtomicGroupSize = 7; + constexpr uint64_t kNumWals = 5; + const std::string kDBId = "db_db"; + + int remaining = kAtomicGroupSize; + autovector> edits; + // Add 5 WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. 
+ edits.back()->AddWal(i, WalMetadata(i)); + edits.back()->MarkAtomicGroup(--remaining); + } + // One edit with the min log number set. + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + edits.back()->MarkAtomicGroup(--remaining); + // Delete the first added 4 WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals); + edits.back()->MarkAtomicGroup(--remaining); + ASSERT_EQ(remaining, 0); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + // Recover a new VersionSet, the min log number and the last WAL should be + // kept. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + std::string db_id; + ASSERT_OK( + new_versions->Recover(column_families_, /*read_only=*/false, &db_id)); + + ASSERT_EQ(db_id, kDBId); + + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNumWals) != wals.end()); + ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize()); + ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals); + } +} + +class VersionSetWithTimestampTest : public VersionSetTest { + public: + static const std::string kNewCfName; + + explicit VersionSetWithTimestampTest() : VersionSetTest() {} + + void SetUp() override { + NewDB(); + Options options; + options.comparator = test::ComparatorWithU64Ts(); + cfd_ = CreateColumnFamily(kNewCfName, options); + EXPECT_NE(nullptr, cfd_); + EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions()); + column_families_.emplace_back(kNewCfName, options); + } + + void TearDown() override { + for (auto* e : edits_) { + delete e; + } + edits_.clear(); + } + + void GenVersionEditsToSetFullHistoryTsLow( + const std::vector& ts_lbs) { + for (const auto ts_lb : ts_lbs) { + VersionEdit* edit = new VersionEdit; + edit->SetColumnFamily(cfd_->GetID()); + std::string ts_str = test::EncodeInt(ts_lb); + edit->SetFullHistoryTsLow(ts_str); + edits_.emplace_back(edit); + } + } + + void VerifyFullHistoryTsLow(uint64_t expected_ts_low) { + std::unique_ptr vset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false, + /*db_id=*/nullptr)); + for (auto* cfd : *(vset->GetColumnFamilySet())) { + ASSERT_NE(nullptr, cfd); + if (cfd->GetName() == kNewCfName) { + ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow()); + } else { + ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty()); + } + } + } + + void DoTest(const std::vector& ts_lbs) { + if (ts_lbs.empty()) { + return; + } + + GenVersionEditsToSetFullHistoryTsLow(ts_lbs); + + Status s; + mutex_.Lock(); + s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), + edits_, &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); + } + + protected: + ColumnFamilyData* cfd_{nullptr}; + // edits_ must contain and own pointers to heap-alloc VersionEdit objects. + autovector edits_; +}; + +const std::string VersionSetWithTimestampTest::kNewCfName("new_cf"); + +TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) { + constexpr uint64_t kTsLow = 100; + DoTest({kTsLow}); +} + +// Simulate the application increasing full_history_ts_low. 
+TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) { + const std::vector ts_lbs = {100, 101, 102, 103}; + DoTest(ts_lbs); +} + +// Simulate the application trying to decrease full_history_ts_low +// unsuccessfully. If the application calls public API sequentially to +// decrease the lower bound ts, RocksDB will return an InvalidArgument +// status before involving VersionSet. Only when multiple threads trying +// to decrease the lower bound concurrently will this case ever happen. Even +// so, the lower bound cannot be decreased. The application will be notified +// via return value of the API. +TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) { + const std::vector ts_lbs = {103, 102, 101, 100}; + DoTest(ts_lbs); +} + class VersionSetAtomicGroupTest : public VersionSetTestBase, public testing::Test { public: @@ -1242,12 +1887,9 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, last_in_atomic_group_ = true; }); SyncPoint::GetInstance()->SetCallBack( - "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) { + "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) { num_recovered_edits_ = *reinterpret_cast(arg); }); - SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ReadAndApply:AppliedEdits", - [&](void* arg) { num_applied_edits_ = *reinterpret_cast(arg); }); SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:AtomicGroup", [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); @@ -1287,7 +1929,6 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, bool last_in_atomic_group_ = false; int num_edits_in_atomic_group_ = 0; int num_recovered_edits_ = 0; - int num_applied_edits_ = 0; VersionEdit corrupted_edit_; VersionEdit edit_with_incorrect_group_size_; std::unique_ptr log_writer_; @@ -1303,7 +1944,6 @@ TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) { EXPECT_TRUE(first_in_atomic_group_); EXPECT_TRUE(last_in_atomic_group_); EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1325,7 +1965,6 @@ TEST_F(VersionSetAtomicGroupTest, EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1338,20 +1977,20 @@ TEST_F(VersionSetAtomicGroupTest, EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, &manifest_reporter, &manifest_reader_status)); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); AddNewEditsToLog(kAtomicGroupSize); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_TRUE(first_in_atomic_group_); EXPECT_TRUE(last_in_atomic_group_); // The recover should clean up the replay buffer. 
EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); - EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); + EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1367,7 +2006,6 @@ TEST_F(VersionSetAtomicGroupTest, EXPECT_FALSE(last_in_atomic_group_); EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1399,14 +2037,13 @@ TEST_F(VersionSetAtomicGroupTest, InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); // Reactive version set should be empty now. EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1423,13 +2060,14 @@ TEST_F(VersionSetAtomicGroupTest, &manifest_reader_status)); EXPECT_EQ(column_families_.size(), reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); // Write a few edits in an atomic group. AddNewEditsToLog(kNumberOfPersistedVersionEdits); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_TRUE(first_in_atomic_group_); EXPECT_FALSE(last_in_atomic_group_); @@ -1438,8 +2076,6 @@ TEST_F(VersionSetAtomicGroupTest, EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == kNumberOfPersistedVersionEdits); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); - EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1486,8 +2122,8 @@ TEST_F(VersionSetAtomicGroupTest, // Write the corrupted edits. 
AddNewEditsToLog(kAtomicGroupSize); mu.Lock(); - EXPECT_NOK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_NOK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), corrupted_edit_.DebugString()); @@ -1536,8 +2172,8 @@ TEST_F(VersionSetAtomicGroupTest, &manifest_reader_status)); AddNewEditsToLog(kAtomicGroupSize); mu.Lock(); - EXPECT_NOK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_NOK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_EQ(edits_[1].DebugString(), edit_with_incorrect_group_size_.DebugString()); @@ -1651,10 +2287,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); - if (cfd_to_drop->Unref()) { - delete cfd_to_drop; - cfd_to_drop = nullptr; - } + cfd_to_drop->UnrefAndTryDelete(); } INSTANTIATE_TEST_CASE_P( @@ -1674,14 +2307,13 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, assert(log_writer != nullptr); VersionEdit new_db; new_db.SetLogNumber(0); - std::unique_ptr file; const std::string manifest_path = DescriptorFileName(dbname_, 1); - Status s = env_->NewWritableFile( - manifest_path, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest_path, fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); ASSERT_OK(s); - std::unique_ptr file_writer( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - manifest_path, env_options_)); log_writer->reset(new log::Writer(std::move(file_writer), 0, true)); std::string record; ASSERT_TRUE(new_db.EncodeTo(&record)); @@ -1741,19 +2373,20 @@ class VersionSetTestEmptyDb assert(nullptr != log_writer); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - std::unique_ptr impl(new DBImpl(DBOptions(), dbname_)); + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; impl->GetDbIdentityFromIdentityFile(&db_id); new_db.SetDBId(db_id); } const std::string manifest_path = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest_path, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest_path, fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); ASSERT_OK(s); - std::unique_ptr file_writer( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - manifest_path, env_options_)); { log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; @@ -2057,16 +2690,18 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, assert(last_seqno != nullptr); assert(log_writer != nullptr); const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr 
file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - std::unique_ptr impl(new DBImpl(DBOptions(), dbname_)); + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; impl->GetDbIdentityFromIdentityFile(&db_id); new_db.SetDBId(db_id); @@ -2144,18 +2779,17 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, std::unique_ptr file; Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr); ASSERT_OK(s); - std::unique_ptr fwriter( - new WritableFileWriter(std::move(file), fname, FileOptions(), env_)); - std::vector> - int_tbl_prop_collector_factories; + std::unique_ptr fwriter(new WritableFileWriter( + std::move(file), fname, FileOptions(), env_->GetSystemClock().get())); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::unique_ptr builder(table_factory_->NewTableBuilder( TableBuilderOptions( - immutable_cf_options_, mutable_cf_options_, *internal_comparator_, + immutable_options_, mutable_cf_options_, *internal_comparator_, &int_tbl_prop_collector_factories, kNoCompression, - /*_sample_for_compression=*/0, CompressionOptions(), - /*_skip_filters=*/false, info.column_family, info.level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + info.column_family, info.level), fwriter.get())); InternalKey ikey(info.key, 0, ValueType::kTypeValue); builder->Add(ikey.Encode(), "value"); @@ -2165,11 +2799,9 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); ASSERT_OK(s); ASSERT_NE(0, file_size); - FileMetaData meta; - meta = FileMetaData(file_num, /*file_path_id=*/0, file_size, ikey, ikey, - 0, 0, false, 0, 0, 0, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); - file_metas->emplace_back(meta); + file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey, + ikey, 0, 0, false, 0, 0, 0, kUnknownFileChecksum, + kUnknownFileChecksumFuncName); } } @@ -2364,6 +2996,27 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) { } } +TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { + NewDB(); + + SstInfo sst(100, kDefaultColumnFamilyName, "a"); + std::vector file_metas; + CreateDummyTableFiles({sst}, &file_metas); + + constexpr WalNumber kMinWalNumberToKeep2PC = 10; + VersionEdit edit; + edit.AddFile(0, file_metas[0]); + edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC); + + for (int i = 0; i < 3; i++) { + CreateNewManifest(); + ReopenDB(); + ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/wal_edit.cc b/db/wal_edit.cc index f1939534476..786d68b5c83 100644 --- a/db/wal_edit.cc +++ b/db/wal_edit.cc @@ -19,10 +19,6 @@ void WalAddition::EncodeTo(std::string* dst) const { PutVarint64(dst, metadata_.GetSyncedSizeInBytes()); } - if (metadata_.IsClosed()) { - PutVarint32(dst, static_cast(WalAdditionTag::kClosed)); - } - PutVarint32(dst, static_cast(WalAdditionTag::kTerminate)); } @@ -48,10 +44,6 @@ Status WalAddition::DecodeFrom(Slice* src) { 
metadata_.SetSyncedSizeInBytes(size); break; } - case WalAdditionTag::kClosed: { - metadata_.SetClosed(); - break; - } // TODO: process future tags such as checksum. case WalAdditionTag::kTerminate: return Status::OK(); @@ -66,15 +58,13 @@ Status WalAddition::DecodeFrom(Slice* src) { JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) { jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes" - << wal.GetMetadata().GetSyncedSizeInBytes() << "Closed" - << wal.GetMetadata().IsClosed(); + << wal.GetMetadata().GetSyncedSizeInBytes(); return jw; } std::ostream& operator<<(std::ostream& os, const WalAddition& wal) { os << "log_number: " << wal.GetLogNumber() - << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes() - << " closed: " << wal.GetMetadata().IsClosed(); + << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes(); return os; } @@ -115,33 +105,30 @@ std::string WalDeletion::DebugString() const { } Status WalSet::AddWal(const WalAddition& wal) { + if (wal.GetLogNumber() < min_wal_number_to_keep_) { + // The WAL has been obsolete, ignore it. + return Status::OK(); + } + auto it = wals_.lower_bound(wal.GetLogNumber()); bool existing = it != wals_.end() && it->first == wal.GetLogNumber(); - if (wal.GetMetadata().IsClosed()) { - // The WAL must exist and not closed. - if (!existing) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " is not created before closing"; - return Status::Corruption("WalSet", ss.str()); - } - if (it->second.IsClosed()) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " is closed more than once"; - return Status::Corruption("WalSet", ss.str()); - } + if (existing && !wal.GetMetadata().HasSyncedSize()) { + std::stringstream ss; + ss << "WAL " << wal.GetLogNumber() << " is created more than once"; + return Status::Corruption("WalSet::AddWal", ss.str()); } // If the WAL has synced size, it must >= the previous size. - if (existing && it->second.HasSyncedSize() && - (!wal.GetMetadata().HasSyncedSize() || - wal.GetMetadata().GetSyncedSizeInBytes() < - it->second.GetSyncedSizeInBytes())) { + if (wal.GetMetadata().HasSyncedSize() && existing && + it->second.HasSyncedSize() && + wal.GetMetadata().GetSyncedSizeInBytes() < + it->second.GetSyncedSizeInBytes()) { std::stringstream ss; ss << "WAL " << wal.GetLogNumber() << " must not have smaller synced size than previous one"; - return Status::Corruption("WalSet", ss.str()); + return Status::Corruption("WalSet::AddWal", ss.str()); } if (existing) { - it->second = wal.GetMetadata(); + it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes()); } else { wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()}); } @@ -159,36 +146,19 @@ Status WalSet::AddWals(const WalAdditions& wals) { return s; } -Status WalSet::DeleteWal(const WalDeletion& wal) { - auto it = wals_.find(wal.GetLogNumber()); - // The WAL must exist and has been closed. 
- if (it == wals_.end()) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " must exist before deletion"; - return Status::Corruption("WalSet", ss.str()); - } - if (!it->second.IsClosed()) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " must be closed before deletion"; - return Status::Corruption("WalSet", ss.str()); +Status WalSet::DeleteWalsBefore(WalNumber wal) { + if (wal > min_wal_number_to_keep_) { + min_wal_number_to_keep_ = wal; + wals_.erase(wals_.begin(), wals_.lower_bound(wal)); } - wals_.erase(it); return Status::OK(); } -Status WalSet::DeleteWals(const WalDeletions& wals) { - Status s; - for (const WalDeletion& wal : wals) { - s = DeleteWal(wal); - if (!s.ok()) { - break; - } - } - return s; +void WalSet::Reset() { + wals_.clear(); + min_wal_number_to_keep_ = 0; } -void WalSet::Reset() { wals_.clear(); } - Status WalSet::CheckWals( Env* env, const std::unordered_map& logs_on_disk) const { diff --git a/db/wal_edit.h b/db/wal_edit.h index 9493c3648bb..7e1f9a5762d 100644 --- a/db/wal_edit.h +++ b/db/wal_edit.h @@ -35,10 +35,6 @@ class WalMetadata { explicit WalMetadata(uint64_t synced_size_bytes) : synced_size_bytes_(synced_size_bytes) {} - bool IsClosed() const { return closed_; } - - void SetClosed() { closed_ = true; } - bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; } void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; } @@ -52,9 +48,6 @@ class WalMetadata { // Size of the most recently synced WAL in bytes. uint64_t synced_size_bytes_ = kUnknownWalSize; - - // Whether the WAL is closed. - bool closed_ = false; }; // These tags are persisted to MANIFEST, so it's part of the user API. @@ -63,8 +56,6 @@ enum class WalAdditionTag : uint32_t { kTerminate = 1, // Synced Size in bytes. kSyncedSize = 2, - // Whether the WAL is closed. - kClosed = 3, // Add tags in the future, such as checksum? }; @@ -98,10 +89,10 @@ JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal); using WalAdditions = std::vector; -// Records the event of deleting/archiving a WAL in VersionEdit. +// Records the event of deleting WALs before the specified log number. class WalDeletion { public: - WalDeletion() : number_(0) {} + WalDeletion() : number_(kEmpty) {} explicit WalDeletion(WalNumber number) : number_(number) {} @@ -113,18 +104,22 @@ class WalDeletion { std::string DebugString() const; + bool IsEmpty() const { return number_ == kEmpty; } + + void Reset() { number_ = kEmpty; } + private: + static constexpr WalNumber kEmpty = 0; + WalNumber number_; }; std::ostream& operator<<(std::ostream& os, const WalDeletion& wal); JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal); -using WalDeletions = std::vector; - // Used in VersionSet to keep the current set of WALs. // -// When a WAL is created, closed, deleted, or archived, +// When a WAL is synced or becomes obsoleted, // a VersionEdit is logged to MANIFEST and // the WAL is added to or deleted from WalSet. // @@ -139,16 +134,16 @@ class WalSet { Status AddWal(const WalAddition& wal); Status AddWals(const WalAdditions& wals); - // Delete WAL(s). - // The WAL to be deleted must exist and be closed, otherwise, - // return Status::Corruption. + // Delete WALs with log number smaller than the specified wal number. // Can happen when applying a VersionEdit or recovering from MANIFEST. - Status DeleteWal(const WalDeletion& wal); - Status DeleteWals(const WalDeletions& wals); + Status DeleteWalsBefore(WalNumber wal); // Resets the internal state. 
void Reset(); + // WALs with number less than MinWalNumberToKeep should not exist in WalSet. + WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; } + const std::map& GetWals() const { return wals_; } // Checks whether there are missing or corrupted WALs. @@ -163,6 +158,9 @@ class WalSet { private: std::map wals_; + // WAL number < min_wal_number_to_keep_ should not exist in wals_. + // It's monotonically increasing, in-memory only, not written to MANIFEST. + WalNumber min_wal_number_to_keep_ = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/wal_edit_test.cc b/db/wal_edit_test.cc index b6eb347a300..5895e180d1c 100644 --- a/db/wal_edit_test.cc +++ b/db/wal_edit_test.cc @@ -24,18 +24,8 @@ TEST(WalSet, AddDeleteReset) { } ASSERT_EQ(wals.GetWals().size(), 10); - // Close WAL 1 - 5. - for (WalNumber log_number = 1; log_number <= 5; log_number++) { - WalMetadata wal(100); - wal.SetClosed(); - wals.AddWal(WalAddition(log_number, wal)); - } - ASSERT_EQ(wals.GetWals().size(), 10); - // Delete WAL 1 - 5. - for (WalNumber log_number = 1; log_number <= 5; log_number++) { - wals.DeleteWal(WalDeletion(log_number)); - } + wals.DeleteWalsBefore(6); ASSERT_EQ(wals.GetWals().size(), 5); WalNumber expected_log_number = 6; @@ -72,65 +62,43 @@ TEST(WalSet, SmallerSyncedSize) { std::string::npos); } -TEST(WalSet, CloseTwice) { +TEST(WalSet, CreateTwice) { constexpr WalNumber kNumber = 100; - constexpr uint64_t kBytes = 200; WalSet wals; ASSERT_OK(wals.AddWal(WalAddition(kNumber))); - WalMetadata wal(kBytes); - wal.SetClosed(); - ASSERT_OK(wals.AddWal(WalAddition(kNumber, wal))); - Status s = wals.AddWal(WalAddition(kNumber, wal)); + Status s = wals.AddWal(WalAddition(kNumber)); ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE(s.ToString().find("WAL 100 is closed more than once") != + ASSERT_TRUE(s.ToString().find("WAL 100 is created more than once") != std::string::npos); } -TEST(WalSet, CloseBeforeCreate) { - constexpr WalNumber kNumber = 100; - constexpr uint64_t kBytes = 200; +TEST(WalSet, DeleteAllWals) { + constexpr WalNumber kMaxWalNumber = 10; WalSet wals; - WalMetadata wal(kBytes); - wal.SetClosed(); - Status s = wals.AddWal(WalAddition(kNumber, wal)); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE(s.ToString().find("WAL 100 is not created before closing") != - std::string::npos); + for (WalNumber i = 1; i <= kMaxWalNumber; i++) { + wals.AddWal(WalAddition(i)); + } + ASSERT_OK(wals.DeleteWalsBefore(kMaxWalNumber + 1)); } -TEST(WalSet, CreateAfterClose) { +TEST(WalSet, AddObsoleteWal) { constexpr WalNumber kNumber = 100; - constexpr uint64_t kBytes = 200; WalSet wals; + ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1)); ASSERT_OK(wals.AddWal(WalAddition(kNumber))); - WalMetadata wal(kBytes); - wal.SetClosed(); - ASSERT_OK(wals.AddWal(WalAddition(kNumber, wal))); - Status s = wals.AddWal(WalAddition(kNumber)); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE( - s.ToString().find( - "WAL 100 must not have smaller synced size than previous one") != - std::string::npos); -} - -TEST(WalSet, DeleteNonExistingWal) { - constexpr WalNumber kNonExistingNumber = 100; - WalSet wals; - Status s = wals.DeleteWal(WalDeletion(kNonExistingNumber)); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE(s.ToString().find("WAL 100 must exist before deletion") != - std::string::npos); + ASSERT_TRUE(wals.GetWals().empty()); } -TEST(WalSet, DeleteNonClosedWal) { - constexpr WalNumber kNonClosedWalNumber = 100; +TEST(WalSet, MinWalNumberToKeep) { + constexpr WalNumber kNumber = 100; WalSet wals; - 
ASSERT_OK(wals.AddWal(WalAddition(kNonClosedWalNumber))); - Status s = wals.DeleteWal(WalDeletion(kNonClosedWalNumber)); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE(s.ToString().find("WAL 100 must be closed before deletion") != - std::string::npos); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), 0); + ASSERT_OK(wals.DeleteWalsBefore(kNumber)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber); + ASSERT_OK(wals.DeleteWalsBefore(kNumber - 1)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber); + ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber + 1); } class WalSetTest : public DBTestBase { @@ -165,7 +133,6 @@ class WalSetTest : public DBTestBase { ASSERT_OK(wals_.AddWal(WalAddition(number))); // Close WAL. WalMetadata wal(size_bytes); - wal.SetClosed(); ASSERT_OK(wals_.AddWal(WalAddition(number, wal))); } diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 327f89cdf1d..359cc154678 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -134,14 +134,14 @@ Status WalManager::GetUpdatesSince( // b. get sorted non-empty archived logs // c. delete what should be deleted void WalManager::PurgeObsoleteWALFiles() { - bool const ttl_enabled = db_options_.wal_ttl_seconds > 0; - bool const size_limit_enabled = db_options_.wal_size_limit_mb > 0; + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; if (!ttl_enabled && !size_limit_enabled) { return; } - int64_t current_time; - Status s = env_->GetCurrentTime(¤t_time); + int64_t current_time = 0; + Status s = db_options_.clock->GetCurrentTime(¤t_time); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s", s.ToString().c_str()); @@ -150,7 +150,7 @@ void WalManager::PurgeObsoleteWALFiles() { } uint64_t const now_seconds = static_cast(current_time); uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) - ? db_options_.wal_ttl_seconds / 2 + ? 
db_options_.WAL_ttl_seconds / 2 : kDefaultIntervalToDeleteObsoleteWAL; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { @@ -171,11 +171,10 @@ void WalManager::PurgeObsoleteWALFiles() { size_t log_files_num = 0; uint64_t log_file_size = 0; - for (auto& f : files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { std::string const file_path = archival_dir + "/" + f; if (ttl_enabled) { uint64_t file_m_time; @@ -186,7 +185,7 @@ void WalManager::PurgeObsoleteWALFiles() { s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { s = DeleteDBFile(&db_options_, file_path, archival_dir, false, /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { @@ -235,17 +234,21 @@ void WalManager::PurgeObsoleteWALFiles() { return; } - size_t const files_keep_num = - static_cast(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size); + size_t const files_keep_num = static_cast( + db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size); if (log_files_num <= files_keep_num) { return; } size_t files_del_num = log_files_num - files_keep_num; VectorLogPtr archived_logs; - GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); - - if (files_del_num > archived_logs.size()) { + s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to get archived WALs from: %s: %s", + archival_dir.c_str(), s.ToString().c_str()); + files_del_num = 0; + } else if (files_del_num > archived_logs.size()) { ROCKS_LOG_WARN(db_options_.info_log, "Trying to delete more archived log files than " "exist. 
Deleting all"); @@ -292,7 +295,7 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, for (const auto& f : all_files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { SequenceNumber sequence; Status s = ReadFirstRecord(log_type, number, &sequence); if (!s.ok()) { diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 87c168b703c..580379a6c9e 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -5,20 +5,21 @@ #ifndef ROCKSDB_LITE +#include "db/wal_manager.h" + #include #include -#include "rocksdb/cache.h" -#include "rocksdb/write_batch.h" -#include "rocksdb/write_buffer_manager.h" - #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "db/version_set.h" -#include "db/wal_manager.h" #include "env/mock_env.h" #include "file/writable_file_writer.h" +#include "rocksdb/cache.h" +#include "rocksdb/file_system.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -47,13 +48,14 @@ class WalManagerTest : public testing::Test { std::numeric_limits::max()); db_options_.wal_dir = dbname_; db_options_.env = env_.get(); - fs_.reset(new LegacyFileSystemWrapper(env_.get())); - db_options_.fs = fs_; + db_options_.fs = env_->GetFileSystem(); + db_options_.clock = env_->GetSystemClock().get(); versions_.reset( new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); wal_manager_.reset( new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); @@ -69,9 +71,10 @@ class WalManagerTest : public testing::Test { assert(current_log_writer_.get() != nullptr); uint64_t seq = versions_->LastSequence() + 1; WriteBatch batch; - batch.Put(key, value); + ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK( + current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch))); versions_->SetLastAllocatedSequence(seq); versions_->SetLastPublishedSequence(seq); versions_->SetLastSequence(seq); @@ -81,10 +84,10 @@ class WalManagerTest : public testing::Test { void RollTheLog(bool /*archived*/) { current_log_number_++; std::string fname = ArchivedLogFileName(dbname_, current_log_number_); - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), fname, env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer, + nullptr)); current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false)); } @@ -115,7 +118,6 @@ class WalManagerTest : public testing::Test { WriteBufferManager write_buffer_manager_; std::unique_ptr versions_; std::unique_ptr wal_manager_; - std::shared_ptr fs_; std::unique_ptr current_log_writer_; uint64_t current_log_number_; @@ -124,8 +126,9 @@ class WalManagerTest : public testing::Test { TEST_F(WalManagerTest, ReadFirstRecordCache) { Init(); std::string path = dbname_ + "/000001.log"; - std::unique_ptr file; - 
ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); + std::unique_ptr file; + ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file, + nullptr)); SequenceNumber s; ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s)); @@ -135,14 +138,14 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s)); ASSERT_EQ(s, 0U); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), path, EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), path, FileOptions())); log::Writer writer(std::move(file_writer), 1, db_options_.recycle_log_file_num > 0); WriteBatch batch; - batch.Put("foo", "bar"); + ASSERT_OK(batch.Put("foo", "bar")); WriteBatchInternal::SetSequence(&batch, 10); - writer.AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. // Waiting for lei to finish with db_test @@ -167,14 +170,14 @@ namespace { uint64_t GetLogDirSize(std::string dir_path, Env* env) { uint64_t dir_size = 0; std::vector files; - env->GetChildren(dir_path, &files); + EXPECT_OK(env->GetChildren(dir_path, &files)); for (auto& f : files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { std::string const file_path = dir_path + "/" + f; uint64_t file_size; - env->GetFileSize(file_path, &file_size); + EXPECT_OK(env->GetFileSize(file_path, &file_size)); dir_size += file_size; } } @@ -184,9 +187,9 @@ std::vector ListSpecificFiles( Env* env, const std::string& path, const FileType expected_file_type) { std::vector files; std::vector file_numbers; - env->GetChildren(path, &files); uint64_t number; FileType type; + EXPECT_OK(env->GetChildren(path, &files)); for (size_t i = 0; i < files.size(); ++i) { if (ParseFileName(files[i], &number, &type)) { if (type == expected_file_type) { @@ -209,13 +212,14 @@ int CountRecords(TransactionLogIterator* iter) { EXPECT_OK(iter->status()); iter->Next(); } + EXPECT_OK(iter->status()); return count; } } // namespace TEST_F(WalManagerTest, WALArchivalSizeLimit) { - db_options_.wal_ttl_seconds = 0; - db_options_.wal_size_limit_mb = 1000; + db_options_.WAL_ttl_seconds = 0; + db_options_.WAL_size_limit_MB = 1000; Init(); // TEST : Create WalManager with huge size limit and no ttl. @@ -223,7 +227,7 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) { // Count the archived log files that survived. // Assert that all of them did. // Change size limit. Re-open WalManager. - // Assert that archive is not greater than wal_size_limit_mb after + // Assert that archive is not greater than WAL_size_limit_MB after // PurgeObsoleteWALFiles() // Set ttl and time_to_check_ to small values. Re-open db. // Assert that there are no archived logs left. 
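For reference, these tests pick up the kLogFile to kWalFile FileType rename and use the capitalized DBOptions spellings of the archival retention knobs. A minimal configuration sketch (illustrative only, not part of the patch):

    #include "rocksdb/options.h"
    using namespace ROCKSDB_NAMESPACE;

    Options MakeArchivalOptions() {
      Options options;
      options.WAL_ttl_seconds = 0;    // 0 disables TTL-based purging of archived WALs
      options.WAL_size_limit_MB = 8;  // purge oldest archived WALs once total size exceeds ~8 MB
      return options;
    }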
@@ -232,27 +236,27 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) { CreateArchiveLogs(20, 5000); std::vector log_files = - ListSpecificFiles(env_.get(), archive_dir, kLogFile); + ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_EQ(log_files.size(), 20U); - db_options_.wal_size_limit_mb = 8; + db_options_.WAL_size_limit_MB = 8; Reopen(); wal_manager_->PurgeObsoleteWALFiles(); uint64_t archive_size = GetLogDirSize(archive_dir, env_.get()); - ASSERT_TRUE(archive_size <= db_options_.wal_size_limit_mb * 1024 * 1024); + ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024); - db_options_.wal_ttl_seconds = 1; + db_options_.WAL_ttl_seconds = 1; env_->FakeSleepForMicroseconds(2 * 1000 * 1000); Reopen(); wal_manager_->PurgeObsoleteWALFiles(); - log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile); + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_TRUE(log_files.empty()); } TEST_F(WalManagerTest, WALArchivalTtl) { - db_options_.wal_ttl_seconds = 1000; + db_options_.WAL_ttl_seconds = 1000; Init(); // TEST : Create WalManager with a ttl and no size limit. @@ -265,15 +269,15 @@ TEST_F(WalManagerTest, WALArchivalTtl) { CreateArchiveLogs(20, 5000); std::vector log_files = - ListSpecificFiles(env_.get(), archive_dir, kLogFile); + ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_GT(log_files.size(), 0U); - db_options_.wal_ttl_seconds = 1; + db_options_.WAL_ttl_seconds = 1; env_->FakeSleepForMicroseconds(3 * 1000 * 1000); Reopen(); wal_manager_->PurgeObsoleteWALFiles(); - log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile); + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_TRUE(log_files.empty()); } diff --git a/db/write_batch.cc b/db/write_batch.cc index f9b13406863..1d9423e0d87 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -46,6 +46,7 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" +#include "db/kv_checksum.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" @@ -55,6 +56,7 @@ #include "monitoring/statistics.h" #include "port/lang.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" #include "util/autovector.h" #include "util/cast_util.h" #include "util/coding.h" @@ -140,10 +142,14 @@ struct BatchContentClassifier : public WriteBatch::Handler { class TimestampAssigner : public WriteBatch::Handler { public: - explicit TimestampAssigner(const Slice& ts) - : timestamp_(ts), timestamps_(kEmptyTimestampList) {} - explicit TimestampAssigner(const std::vector& ts_list) - : timestamps_(ts_list) { + explicit TimestampAssigner(const Slice& ts, + WriteBatch::ProtectionInfo* prot_info) + : timestamp_(ts), + timestamps_(kEmptyTimestampList), + prot_info_(prot_info) {} + explicit TimestampAssigner(const std::vector& ts_list, + WriteBatch::ProtectionInfo* prot_info) + : timestamps_(ts_list), prot_info_(prot_info) { SanityCheck(); } ~TimestampAssigner() override {} @@ -167,9 +173,8 @@ class TimestampAssigner : public WriteBatch::Handler { } Status DeleteRangeCF(uint32_t, const Slice& begin_key, - const Slice& end_key) override { + const Slice& /* end_key */) override { AssignTimestamp(begin_key); - AssignTimestamp(end_key); ++idx_; return Status::OK(); } @@ -221,12 +226,17 @@ class TimestampAssigner : public WriteBatch::Handler { const Slice& ts = timestamps_.empty() ? 
timestamp_ : timestamps_[idx_]; size_t ts_sz = ts.size(); char* ptr = const_cast(key.data() + key.size() - ts_sz); + if (prot_info_ != nullptr) { + Slice old_ts(ptr, ts_sz), new_ts(ts.data(), ts_sz); + prot_info_->entries_[idx_].UpdateT(old_ts, new_ts); + } memcpy(ptr, ts.data(), ts_sz); } static const std::vector kEmptyTimestampList; const Slice timestamp_; const std::vector& timestamps_; + WriteBatch::ProtectionInfo* const prot_info_; size_t idx_ = 0; // No copy or move. @@ -258,6 +268,21 @@ WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz) rep_.resize(WriteBatchInternal::kHeader); } +WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz, + size_t protection_bytes_per_key) + : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) { + // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per + // entry. + assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8); + if (protection_bytes_per_key != 0) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + } + rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) + ? reserved_bytes + : WriteBatchInternal::kHeader); + rep_.resize(WriteBatchInternal::kHeader); +} + WriteBatch::WriteBatch(const std::string& rep) : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), @@ -280,6 +305,10 @@ WriteBatch::WriteBatch(const WriteBatch& src) save_points_.reset(new SavePoints()); save_points_->stack = src.save_points_->stack; } + if (src.prot_info_ != nullptr) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + prot_info_->entries_ = src.prot_info_->entries_; + } } WriteBatch::WriteBatch(WriteBatch&& src) noexcept @@ -287,6 +316,7 @@ WriteBatch::WriteBatch(WriteBatch&& src) noexcept wal_term_point_(std::move(src.wal_term_point_)), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), + prot_info_(std::move(src.prot_info_)), rep_(std::move(src.rep_)), timestamp_size_(src.timestamp_size_) {} @@ -331,6 +361,9 @@ void WriteBatch::Clear() { } } + if (prot_info_ != nullptr) { + prot_info_->entries_.clear(); + } wal_term_point_.clear(); } @@ -359,6 +392,13 @@ void WriteBatch::MarkWalTerminationPoint() { wal_term_point_.content_flags = content_flags_; } +size_t WriteBatch::GetProtectionBytesPerKey() const { + if (prot_info_ != nullptr) { + return prot_info_->GetBytesPerKey(); + } + return 0; +} + bool WriteBatch::HasPut() const { return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; } @@ -640,7 +680,8 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, case kTypeBeginPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); - handler->MarkBeginPrepare(); + s = handler->MarkBeginPrepare(); + assert(s.ok()); empty_batch = false; if (!handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -659,7 +700,8 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, case kTypeBeginPersistedPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); - handler->MarkBeginPrepare(); + s = handler->MarkBeginPrepare(); + assert(s.ok()); empty_batch = false; if (handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -672,7 +714,8 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, case kTypeBeginUnprepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); - 
handler->MarkBeginPrepare(true /* unprepared */); + s = handler->MarkBeginPrepare(true /* unprepared */); + assert(s.ok()); empty_batch = false; if (handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -691,23 +734,27 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, case kTypeEndPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); - handler->MarkEndPrepare(xid); + s = handler->MarkEndPrepare(xid); + assert(s.ok()); empty_batch = true; break; case kTypeCommitXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); - handler->MarkCommit(xid); + s = handler->MarkCommit(xid); + assert(s.ok()); empty_batch = true; break; case kTypeRollbackXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); - handler->MarkRollback(xid); + s = handler->MarkRollback(xid); + assert(s.ok()); empty_batch = true; break; case kTypeNoop: - handler->MarkNoop(empty_batch); + s = handler->MarkNoop(empty_batch); + assert(s.ok()); empty_batch = true; break; default: @@ -770,18 +817,31 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } + std::string timestamp(b->timestamp_size_, '\0'); if (0 == b->timestamp_size_) { PutLengthPrefixedSlice(&b->rep_, key); } else { PutVarint32(&b->rep_, static_cast(key.size() + b->timestamp_size_)); b->rep_.append(key.data(), key.size()); - b->rep_.append(b->timestamp_size_, '\0'); + b->rep_.append(timestamp); } PutLengthPrefixedSlice(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // Technically the optype could've been `kTypeColumnFamilyValue` with the + // CF ID encoded in the `WriteBatch`. That distinction is unimportant + // however since we verify CF ID is correct, as well as all other fields + // (a missing/extra encoded CF ID would corrupt another field). It is + // convenient to consolidate on `kTypeValue` here as that is what will be + // inserted into memtable. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeValue, timestamp) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -826,6 +886,7 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } + std::string timestamp(b->timestamp_size_, '\0'); if (0 == b->timestamp_size_) { PutLengthPrefixedSliceParts(&b->rep_, key); } else { @@ -835,6 +896,14 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. 
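The constructor overload and GetProtectionBytesPerKey() introduced above are the user-visible surface of the protection-info change: a batch built with protection_bytes_per_key == 8 appends one integrity-protection entry per operation alongside the serialized record. A minimal usage sketch (argument values are illustrative; the overload is the one added by this patch):

    #include <cassert>
    #include "rocksdb/write_batch.h"
    using namespace ROCKSDB_NAMESPACE;

    void ProtectedBatchSketch() {
      // 8 is currently the only supported non-zero protection_bytes_per_key.
      WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, 0 /* ts_sz */,
                       8 /* protection_bytes_per_key */);
      assert(batch.Put("key", "value").ok());  // also records a ProtectionInfo entry
      assert(batch.GetProtectionBytesPerKey() == 8);
    }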
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeValue, timestamp) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -909,17 +978,26 @@ Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); PutVarint32(&b->rep_, column_family_id); } + std::string timestamp(b->timestamp_size_, '\0'); if (0 == b->timestamp_size_) { PutLengthPrefixedSlice(&b->rep_, key); } else { PutVarint32(&b->rep_, static_cast(key.size() + b->timestamp_size_)); b->rep_.append(key.data(), key.size()); - b->rep_.append(b->timestamp_size_, '\0'); + b->rep_.append(timestamp); } b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, "" /* value */, kTypeDeletion, timestamp) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -938,6 +1016,7 @@ Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); PutVarint32(&b->rep_, column_family_id); } + std::string timestamp(b->timestamp_size_, '\0'); if (0 == b->timestamp_size_) { PutLengthPrefixedSliceParts(&b->rep_, key); } else { @@ -946,6 +1025,16 @@ Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, + SliceParts(nullptr /* _parts */, 0 /* _num_parts */), + kTypeDeletion, timestamp) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -970,6 +1059,15 @@ Status WriteBatchInternal::SingleDelete(WriteBatch* b, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_SINGLE_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVOT(key, "" /* value */, + kTypeSingleDeletion, + "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -994,6 +1092,17 @@ Status WriteBatchInternal::SingleDelete(WriteBatch* b, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_SINGLE_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, + SliceParts(nullptr /* _parts */, + 0 /* _num_parts */) /* value */, + kTypeSingleDeletion, "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1019,6 +1128,16 @@ Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE_RANGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + // In `DeleteRange()`, the end key is treated as the value. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVOT(begin_key, end_key, + kTypeRangeDeletion, + "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1044,6 +1163,16 @@ Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE_RANGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + // In `DeleteRange()`, the end key is treated as the value. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVOT(begin_key, end_key, + kTypeRangeDeletion, + "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1076,6 +1205,14 @@ Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_MERGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeMerge, "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1106,6 +1243,14 @@ Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_MERGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeMerge, "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1131,6 +1276,14 @@ Status WriteBatchInternal::PutBlobIndex(WriteBatch* b, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_BLOB_INDEX, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeBlobIndex, "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1169,6 +1322,9 @@ Status WriteBatch::RollbackToSavePoint() { Clear(); } else { rep_.resize(savepoint.size); + if (prot_info_ != nullptr) { + prot_info_->entries_.resize(savepoint.count); + } WriteBatchInternal::SetCount(this, savepoint.count); content_flags_.store(savepoint.content_flags, std::memory_order_relaxed); } @@ -1188,12 +1344,12 @@ Status WriteBatch::PopSavePoint() { } Status WriteBatch::AssignTimestamp(const Slice& ts) { - TimestampAssigner ts_assigner(ts); + TimestampAssigner ts_assigner(ts, prot_info_.get()); return Iterate(&ts_assigner); } Status WriteBatch::AssignTimestamps(const std::vector& ts_list) { - TimestampAssigner ts_assigner(ts_list); + TimestampAssigner ts_assigner(ts_list, prot_info_.get()); return Iterate(&ts_assigner); } @@ -1210,6 +1366,8 @@ class MemTableInserter : public WriteBatch::Handler { DBImpl* db_; const bool concurrent_memtable_writes_; bool post_info_created_; + const WriteBatch::ProtectionInfo* prot_info_; + size_t prot_info_idx_; bool* has_valid_writes_; // On some (!) platforms just default creating @@ -1272,6 +1430,16 @@ class MemTableInserter : public WriteBatch::Handler { (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_); } + const ProtectionInfoKVOTC64* NextProtectionInfo() { + const ProtectionInfoKVOTC64* res = nullptr; + if (prot_info_ != nullptr) { + assert(prot_info_idx_ < prot_info_->entries_.size()); + res = &prot_info_->entries_[prot_info_idx_]; + ++prot_info_idx_; + } + return res; + } + protected: bool WriteBeforePrepare() const override { return write_before_prepare_; } bool WriteAfterCommit() const override { return write_after_commit_; } @@ -1284,6 +1452,7 @@ class MemTableInserter : public WriteBatch::Handler { bool ignore_missing_column_families, uint64_t recovering_log_number, DB* db, bool concurrent_memtable_writes, + const WriteBatch::ProtectionInfo* prot_info, bool* has_valid_writes = nullptr, bool seq_per_batch = false, bool batch_per_txn = true, bool hint_per_batch = false) : sequence_(_sequence), @@ -1296,6 +1465,8 @@ class MemTableInserter : public WriteBatch::Handler { db_(static_cast_with_check(db)), concurrent_memtable_writes_(concurrent_memtable_writes), post_info_created_(false), + prot_info_(prot_info), + prot_info_idx_(0), has_valid_writes_(has_valid_writes), rebuilding_trx_(nullptr), rebuilding_trx_seq_(0), @@ -1353,6 +1524,10 @@ class MemTableInserter : public WriteBatch::Handler { } void set_log_number_ref(uint64_t log) { log_number_ref_ = log; } + void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) { + prot_info_ = prot_info; + prot_info_idx_ = 0; + } SequenceNumber sequence() const { return sequence_; } @@ -1408,27 +1583,34 @@ class MemTableInserter : public WriteBatch::Handler { } Status PutCFImpl(uint32_t column_family_id, const Slice& key, - const Slice& value, ValueType value_type) { + const Slice& value, ValueType value_type, + const ProtectionInfoKVOTS64* kv_prot_info) { // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. 
+ return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, + value); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } + assert(ret_status.ok()); MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); @@ -1436,23 +1618,17 @@ class MemTableInserter : public WriteBatch::Handler { // any kind of transactions including the ones that use seq_per_batch assert(!seq_per_batch_ || !moptions->inplace_update_support); if (!moptions->inplace_update_support) { - bool mem_res = - mem->Add(sequence_, value_type, key, value, + ret_status = + mem->Add(sequence_, value_type, key, value, kv_prot_info, concurrent_memtable_writes_, get_post_process_info(mem), hint_per_batch_ ? &GetHintMap()[mem] : nullptr); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); - } } else if (moptions->inplace_callback == nullptr) { assert(!concurrent_memtable_writes_); - mem->Update(sequence_, key, value); + ret_status = mem->Update(sequence_, key, value, kv_prot_info); } else { assert(!concurrent_memtable_writes_); - if (mem->UpdateCallback(sequence_, key, value)) { - } else { + ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info); + if (ret_status.IsNotFound()) { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; @@ -1466,94 +1642,149 @@ class MemTableInserter : public WriteBatch::Handler { std::string merged_value; auto cf_handle = cf_mems_->GetColumnFamilyHandle(); - Status s = Status::NotSupported(); + Status get_status = Status::NotSupported(); if (db_ != nullptr && recovering_log_number_ == 0) { if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - s = db_->Get(ropts, cf_handle, key, &prev_value); + get_status = db_->Get(ropts, cf_handle, key, &prev_value); } - - char* prev_buffer = const_cast(prev_value.c_str()); - uint32_t prev_size = static_cast(prev_value.size()); - auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, - s.ok() ? &prev_size : nullptr, - value, &merged_value); - if (status == UpdateStatus::UPDATED_INPLACE) { - // prev_value is updated in-place with final value. - bool mem_res __attribute__((__unused__)); - mem_res = mem->Add( - sequence_, value_type, key, Slice(prev_buffer, prev_size)); - assert(mem_res); - RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); - } else if (status == UpdateStatus::UPDATED) { - // merged_value contains the final value. 
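PutCFImpl and the other handlers below now rely on MemTable::Add/Update/UpdateCallback returning a Status, with Status::TryAgain standing in for the old boolean failure on a duplicate key+seq in a seq_per_batch batch, and with sequence advancement and memtable-full checks happening only on success. A condensed sketch of that calling pattern, under the signatures assumed by this patch (kv_prot_info may be nullptr when per-key protection is disabled):

    Status s = mem->Add(sequence_, kTypeValue, key, value, kv_prot_info,
                        concurrent_memtable_writes_);
    if (UNLIKELY(s.IsTryAgain())) {
      assert(seq_per_batch_);
      MaybeAdvanceSeq(true /* batch_boundary */);  // start the next sub-batch
    } else if (s.ok()) {
      MaybeAdvanceSeq();
      CheckMemtableFull();
    }
    return s;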
- bool mem_res __attribute__((__unused__)); - mem_res = - mem->Add(sequence_, value_type, key, Slice(merged_value)); - assert(mem_res); - RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + // Intentionally overwrites the `NotFound` in `ret_status`. + if (!get_status.ok() && !get_status.IsNotFound()) { + ret_status = get_status; + } else { + ret_status = Status::OK(); + } + if (ret_status.ok()) { + UpdateStatus update_status; + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = static_cast(prev_value.size()); + if (get_status.ok()) { + update_status = moptions->inplace_callback(prev_buffer, &prev_size, + value, &merged_value); + } else { + update_status = moptions->inplace_callback( + nullptr /* existing_value */, nullptr /* existing_value_size */, + value, &merged_value); + } + if (update_status == UpdateStatus::UPDATED_INPLACE) { + assert(get_status.ok()); + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(value, + Slice(prev_buffer, prev_size)); + // prev_value is updated in-place with final value. + ret_status = mem->Add(sequence_, value_type, key, + Slice(prev_buffer, prev_size), + &updated_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, value_type, key, + Slice(prev_buffer, prev_size), + nullptr /* kv_prot_info */); + } + if (ret_status.ok()) { + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } else if (update_status == UpdateStatus::UPDATED) { + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(value, merged_value); + // merged_value contains the final value. + ret_status = mem->Add(sequence_, value_type, key, + Slice(merged_value), &updated_kv_prot_info); + } else { + // merged_value contains the final value. + ret_status = + mem->Add(sequence_, value_type, key, Slice(merged_value), + nullptr /* kv_prot_info */); + } + if (ret_status.ok()) { + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } } } } + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); } - // Since all Puts are logged in transaction logs (if enabled), always bump - // sequence number. Even if the update eventually fails and does not result - // in memtable add/update. 
- MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - return PutCFImpl(column_family_id, key, value, kTypeValue); + const auto* kv_prot_info = NextProtectionInfo(); + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + return PutCFImpl(column_family_id, key, value, kTypeValue, + &mem_kv_prot_info); + } + return PutCFImpl(column_family_id, key, value, kTypeValue, + nullptr /* kv_prot_info */); } Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key, - const Slice& value, ValueType delete_type) { + const Slice& value, ValueType delete_type, + const ProtectionInfoKVOTS64* kv_prot_info) { Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); - bool mem_res = - mem->Add(sequence_, delete_type, key, value, + ret_status = + mem->Add(sequence_, delete_type, key, value, kv_prot_info, concurrent_memtable_writes_, get_post_process_info(mem), hint_per_batch_ ? &GetHintMap()[mem] : nullptr); - if (UNLIKELY(!mem_res)) { + if (UNLIKELY(ret_status.IsTryAgain())) { assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); } - MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } @@ -1564,78 +1795,112 @@ class MemTableInserter : public WriteBatch::Handler { : 0; const ValueType delete_type = (0 == ts_sz) ? 
kTypeDeletion : kTypeDeletionWithTimestamp; - ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type); + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + nullptr /* kv_prot_info */); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); } return ret_status; } Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, + key); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, - key); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } + assert(ret_status.ok()); - ret_status = - DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, nullptr /* kv_prot_info */); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. 
So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); } return ret_status; } Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, const Slice& end_key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); - // TODO(myabandeh): when transactional DeleteRange support is added, - // check if end_key must also be added. - batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } + assert(ret_status.ok()); + if (db_ != nullptr) { auto cf_handle = cf_mems_->GetColumnFamilyHandle(); if (cf_handle == nullptr) { @@ -1644,6 +1909,8 @@ class MemTableInserter : public WriteBatch::Handler { auto* cfd = static_cast_with_check(cf_handle)->cfd(); if (!cfd->is_delete_range_supported()) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); return Status::NotSupported( std::string("DeleteRange not supported for table type ") + cfd->ioptions()->table_factory->Name() + " in CF " + @@ -1651,54 +1918,78 @@ class MemTableInserter : public WriteBatch::Handler { } int cmp = cfd->user_comparator()->Compare(begin_key, end_key); if (cmp > 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); // It's an empty range where endpoints appear mistaken. Don't bother // applying it to the DB, and return an error to the user. return Status::InvalidArgument("end key comes before start key"); } else if (cmp == 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); // It's an empty range. Don't bother applying it to the DB. 
return Status::OK(); } } - ret_status = - DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, nullptr /* kv_prot_info */); + } // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); } return ret_status; } Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, + value); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, - value); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. 
+ ret_status = WriteBatchInternal::Merge(rebuilding_trx_, + column_family_id, key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } + assert(ret_status.ok()); MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); + if (moptions->merge_operator == nullptr) { + return Status::InvalidArgument( + "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`"); + } bool perform_merge = false; assert(!concurrent_memtable_writes_ || moptions->max_successive_merges == 0); @@ -1736,65 +2027,97 @@ class MemTableInserter : public WriteBatch::Handler { if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - db_->Get(read_options, cf_handle, key, &get_value); - Slice get_value_slice = Slice(get_value); - - // 2) Apply this merge - auto merge_operator = moptions->merge_operator; - assert(merge_operator); - - std::string new_value; - - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator, key, &get_value_slice, {value}, &new_value, - moptions->info_log, moptions->statistics, Env::Default()); - - if (!merge_status.ok()) { - // Failed to merge! - // Store the delta in memtable + Status get_status = db_->Get(read_options, cf_handle, key, &get_value); + if (!get_status.ok()) { + // Failed to read a key we know exists. Store the delta in memtable. perform_merge = false; } else { - // 3) Add value to memtable - assert(!concurrent_memtable_writes_); - bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = moptions->merge_operator; + assert(merge_operator); + + std::string new_value; + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, &get_value_slice, {value}, &new_value, + moptions->info_log, moptions->statistics, + SystemClock::Default().get()); + + if (!merge_status.ok()) { + // Failed to merge! 
+ // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + assert(!concurrent_memtable_writes_); + if (kv_prot_info != nullptr) { + auto merged_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + merged_kv_prot_info.UpdateV(value, new_value); + merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue); + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + &merged_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + nullptr /* kv_prot_info */); + } } } } if (!perform_merge) { - // Add merge operator to memtable - bool mem_res = - mem->Add(sequence_, kTypeMerge, key, value, - concurrent_memtable_writes_, get_post_process_info(mem)); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + assert(ret_status.ok()); + // Add merge operand to memtable + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = + mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info, + concurrent_memtable_writes_, get_post_process_info(mem)); + } else { + ret_status = mem->Add( + sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */, + concurrent_memtable_writes_, get_post_process_info(mem)); } } + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, + key, value); } - MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - // Same as PutCF except for value type. - return PutCFImpl(column_family_id, key, value, kTypeBlobIndex); + const auto* kv_prot_info = NextProtectionInfo(); + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + // Same as PutCF except for value type. 
+ return PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + &mem_kv_prot_info); + } else { + return PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + nullptr /* kv_prot_info */); + } } void CheckMemtableFull() { @@ -2000,8 +2323,8 @@ Status WriteBatchInternal::InsertInto( MemTableInserter inserter( sequence, memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, recovery_log_number, db, - concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, - batch_per_txn); + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); for (auto w : write_group) { if (w->CallbackFailed()) { continue; @@ -2014,6 +2337,7 @@ Status WriteBatchInternal::InsertInto( } SetSequence(w->batch, inserter.sequence()); inserter.set_log_number_ref(w->log_ref); + inserter.set_prot_info(w->batch->prot_info_.get()); w->status = w->batch->Iterate(&inserter); if (!w->status.ok()) { return w->status; @@ -2035,13 +2359,15 @@ Status WriteBatchInternal::InsertInto( (void)batch_cnt; #endif assert(writer->ShouldWriteToMemtable()); - MemTableInserter inserter( - sequence, memtables, flush_scheduler, trim_history_scheduler, - ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, - batch_per_txn, hint_per_batch); + MemTableInserter inserter(sequence, memtables, flush_scheduler, + trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn, hint_per_batch); SetSequence(writer->batch, sequence); inserter.set_log_number_ref(writer->log_ref); + inserter.set_prot_info(writer->batch->prot_info_.get()); Status s = writer->batch->Iterate(&inserter); assert(!seq_per_batch || batch_cnt != 0); assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); @@ -2061,8 +2387,8 @@ Status WriteBatchInternal::InsertInto( MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, has_valid_writes, - seq_per_batch, batch_per_txn); + concurrent_memtable_writes, batch->prot_info_.get(), + has_valid_writes, seq_per_batch, batch_per_txn); Status s = batch->Iterate(&inserter); if (next_seq != nullptr) { *next_seq = inserter.sequence(); @@ -2075,6 +2401,7 @@ Status WriteBatchInternal::InsertInto( Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { assert(contents.size() >= WriteBatchInternal::kHeader); + assert(b->prot_info_ == nullptr); b->rep_.assign(contents.data(), contents.size()); b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed); return Status::OK(); @@ -2082,6 +2409,8 @@ Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src, const bool wal_only) { + assert(dst->Count() == 0 || + (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr)); size_t src_len; int src_count; uint32_t src_flags; @@ -2098,6 +2427,13 @@ Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src, src_flags = src->content_flags_.load(std::memory_order_relaxed); } + if (dst->prot_info_ != nullptr) { + std::copy(src->prot_info_->entries_.begin(), + src->prot_info_->entries_.begin() + src_count, + std::back_inserter(dst->prot_info_->entries_)); + } else if (src->prot_info_ != nullptr) 
{ + dst->prot_info_.reset(new WriteBatch::ProtectionInfo(*src->prot_info_)); + } SetCount(dst, Count(dst) + src_count); assert(src->rep_.size() >= WriteBatchInternal::kHeader); dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len); diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 30c48996516..fa863a1d62c 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -9,7 +9,9 @@ #pragma once #include + #include "db/flush_scheduler.h" +#include "db/kv_checksum.h" #include "db/trim_history_scheduler.h" #include "db/write_thread.h" #include "rocksdb/db.h" @@ -61,6 +63,14 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { MemTable* mem_; }; +struct WriteBatch::ProtectionInfo { + // `WriteBatch` usually doesn't contain a huge number of keys so protecting + // with a fixed, non-configurable eight bytes per key may work well enough. + autovector entries_; + + size_t GetBytesPerKey() const { return 8; } +}; + // WriteBatchInternal provides static methods for manipulating a // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { @@ -232,6 +242,9 @@ class LocalSavePoint { if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) { batch_->rep_.resize(savepoint_.size); WriteBatchInternal::SetCount(batch_, savepoint_.count); + if (batch_->prot_info_ != nullptr) { + batch_->prot_info_->entries_.resize(savepoint_.count); + } batch_->content_flags_.store(savepoint_.content_flags, std::memory_order_relaxed); return Status::MemoryLimit(); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 5a210b3d6dd..e4043cc406e 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -7,12 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "rocksdb/db.h" - #include + #include "db/column_family.h" +#include "db/db_test_util.h" #include "db/memtable.h" #include "db/write_batch_internal.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/write_batch_with_index.h" @@ -23,12 +24,16 @@ namespace ROCKSDB_NAMESPACE { -static std::string PrintContents(WriteBatch* b) { +static std::string PrintContents(WriteBatch* b, + bool merge_operator_supported = true) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + if (merge_operator_supported) { + options.merge_operator.reset(new TestPutOperator()); + } + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); @@ -59,10 +64,11 @@ static std::string PrintContents(WriteBatch* b) { if (iter == nullptr) { continue; } + EXPECT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; ikey.clear(); - EXPECT_OK(ParseInternalKey(iter->key(), &ikey)); + EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); switch (ikey.type) { case kTypeValue: state.append("Put("); @@ -112,16 +118,19 @@ static std::string PrintContents(WriteBatch* b) { state.append("@"); state.append(NumberToString(ikey.sequence)); } + EXPECT_OK(iter->status()); } - EXPECT_EQ(b->HasPut(), put_count > 0); - EXPECT_EQ(b->HasDelete(), delete_count > 0); - EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); - EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); - EXPECT_EQ(b->HasMerge(), merge_count > 0); - if (!s.ok()) { + if (s.ok()) { + EXPECT_EQ(b->HasPut(), put_count > 0); + EXPECT_EQ(b->HasDelete(), delete_count > 0); + EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); + EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); + EXPECT_EQ(b->HasMerge(), merge_count > 0); + if (count != WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + } else { state.append(s.ToString()); - } else if (count != WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); } delete mem->Unref(); return state; @@ -138,10 +147,10 @@ TEST_F(WriteBatchTest, Empty) { TEST_F(WriteBatchTest, Multiple) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - batch.DeleteRange(Slice("bar"), Slice("foo")); - batch.Put(Slice("baz"), Slice("boo")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); + ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo"))); + ASSERT_OK(batch.Put(Slice("baz"), Slice("boo"))); WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); ASSERT_EQ(4u, WriteBatchInternal::Count(&batch)); @@ -156,12 +165,12 @@ TEST_F(WriteBatchTest, Multiple) { TEST_F(WriteBatchTest, Corruption) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); WriteBatchInternal::SetSequence(&batch, 200); Slice contents = WriteBatchInternal::Contents(&batch); - WriteBatchInternal::SetContents(&batch, - Slice(contents.data(),contents.size()-1)); + ASSERT_OK(WriteBatchInternal::SetContents( + &batch, Slice(contents.data(), contents.size() - 1))); ASSERT_EQ("Put(foo, 
bar)@200" "Corruption: bad WriteBatch Delete", PrintContents(&batch)); @@ -171,24 +180,24 @@ TEST_F(WriteBatchTest, Append) { WriteBatch b1, b2; WriteBatchInternal::SetSequence(&b1, 200); WriteBatchInternal::SetSequence(&b2, 300); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("", PrintContents(&b1)); ASSERT_EQ(0u, b1.Count()); - b2.Put("a", "va"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Put("a", "va")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200", PrintContents(&b1)); ASSERT_EQ(1u, b1.Count()); b2.Clear(); - b2.Put("b", "vb"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Put("b", "vb")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@201", PrintContents(&b1)); ASSERT_EQ(2u, b1.Count()); - b2.Delete("foo"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Delete("foo")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@202" "Put(b, vb)@201" @@ -196,11 +205,11 @@ TEST_F(WriteBatchTest, Append) { PrintContents(&b1)); ASSERT_EQ(4u, b1.Count()); b2.Clear(); - b2.Put("c", "cc"); - b2.Put("d", "dd"); + ASSERT_OK(b2.Put("c", "cc")); + ASSERT_OK(b2.Put("d", "dd")); b2.MarkWalTerminationPoint(); - b2.Put("e", "ee"); - WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true); + ASSERT_OK(b2.Put("e", "ee")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true)); ASSERT_EQ( "Put(a, va)@200" "Put(b, vb)@202" @@ -223,10 +232,10 @@ TEST_F(WriteBatchTest, SingleDeletion) { WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ("", PrintContents(&batch)); ASSERT_EQ(0u, batch.Count()); - batch.Put("a", "va"); + ASSERT_OK(batch.Put("a", "va")); ASSERT_EQ("Put(a, va)@100", PrintContents(&batch)); ASSERT_EQ(1u, batch.Count()); - batch.SingleDelete("a"); + ASSERT_OK(batch.SingleDelete("a")); ASSERT_EQ( "SingleDelete(a)@101" "Put(a, va)@100", @@ -316,7 +325,7 @@ namespace { TEST_F(WriteBatchTest, PutNotImplemented) { WriteBatch batch; - batch.Put(Slice("k1"), Slice("v1")); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); @@ -326,7 +335,7 @@ TEST_F(WriteBatchTest, PutNotImplemented) { TEST_F(WriteBatchTest, DeleteNotImplemented) { WriteBatch batch; - batch.Delete(Slice("k2")); + ASSERT_OK(batch.Delete(Slice("k2"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Delete(k2)@0", PrintContents(&batch)); @@ -336,7 +345,7 @@ TEST_F(WriteBatchTest, DeleteNotImplemented) { TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { WriteBatch batch; - batch.SingleDelete(Slice("k2")); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch)); @@ -346,7 +355,7 @@ TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { TEST_F(WriteBatchTest, MergeNotImplemented) { WriteBatch batch; - batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch)); @@ -354,16 +363,26 @@ TEST_F(WriteBatchTest, MergeNotImplemented) { ASSERT_OK(batch.Iterate(&handler)); } +TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) { + WriteBatch batch; + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ( + "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator " + "!= nullptr`", + PrintContents(&batch, false /* 
merge_operator_supported */)); +} + TEST_F(WriteBatchTest, Blob) { WriteBatch batch; - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); - batch.Put(Slice("k3"), Slice("v3")); - batch.PutLogData(Slice("blob1")); - batch.Delete(Slice("k2")); - batch.SingleDelete(Slice("k3")); - batch.PutLogData(Slice("blob2")); - batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.Put(Slice("k3"), Slice("v3"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Delete(Slice("k2"))); + ASSERT_OK(batch.SingleDelete(Slice("k3"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); ASSERT_EQ(6u, batch.Count()); ASSERT_EQ( "Merge(foo, bar)@5" @@ -375,7 +394,7 @@ TEST_F(WriteBatchTest, Blob) { PrintContents(&batch)); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(k1, v1)" "Put(k2, v2)" @@ -390,19 +409,19 @@ TEST_F(WriteBatchTest, Blob) { TEST_F(WriteBatchTest, PrepareCommit) { WriteBatch batch; - WriteBatchInternal::InsertNoop(&batch); - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); + ASSERT_OK(WriteBatchInternal::InsertNoop(&batch)); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); batch.SetSavePoint(); - WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")); + ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"))); Status s = batch.RollbackToSavePoint(); ASSERT_EQ(s, Status::NotFound()); - WriteBatchInternal::MarkCommit(&batch, Slice("xid1")); - WriteBatchInternal::MarkRollback(&batch, Slice("xid1")); + ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1"))); + ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1"))); ASSERT_EQ(2u, batch.Count()); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "MarkBeginPrepare(false)" "Put(k1, v1)" @@ -430,7 +449,7 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { raw[0] = c; raw[raw.length() - 1] = c; c++; - batch.Put(raw, raw); + ASSERT_OK(batch.Put(raw, raw)); } ASSERT_EQ(kNumUpdates, batch.Count()); @@ -472,7 +491,7 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { bool Continue() override { return num_seen < kNumUpdates; } } handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ(kNumUpdates, handler.num_seen); } @@ -486,7 +505,7 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { for (char i = 0; i < 2; i++) { raw[0] = 'A' + i; raw[raw.length() - 1] = 'A' - i; - batch.Put(raw, raw); + ASSERT_OK(batch.Put(raw, raw)); } ASSERT_EQ(2u, batch.Count()); @@ -523,7 +542,7 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { bool Continue() override { return num_seen < 2; } } handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ(2, handler.num_seen); } @@ -558,14 +577,14 @@ TEST_F(WriteBatchTest, Continue) { bool Continue() override { return num_seen < 5; } } handler; - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); - batch.PutLogData(Slice("blob1")); - batch.Delete(Slice("k1")); - batch.SingleDelete(Slice("k2")); - batch.PutLogData(Slice("blob2")); - batch.Merge(Slice("foo"), Slice("bar")); - batch.Iterate(&handler); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + 
ASSERT_OK(batch.Delete(Slice("k1"))); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(k1, v1)" "Put(k2, v2)" @@ -577,22 +596,22 @@ TEST_F(WriteBatchTest, Continue) { TEST_F(WriteBatchTest, PutGatherSlices) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); { // Try a write where the key is one slice but the value is two Slice key_slice("baz"); Slice value_slices[2] = { Slice("header"), Slice("payload") }; - batch.Put(SliceParts(&key_slice, 1), - SliceParts(value_slices, 2)); + ASSERT_OK( + batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2))); } { // One where the key is composite but the value is a single slice Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; Slice value_slice("value"); - batch.Put(SliceParts(key_slices, 3), - SliceParts(&value_slice, 1)); + ASSERT_OK( + batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1))); } WriteBatchInternal::SetSequence(&batch, 100); @@ -621,18 +640,18 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { WriteBatch batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Put(&two, Slice("twofoo"), Slice("bar2")); - batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); - batch.Delete(&eight, Slice("eightfoo")); - batch.SingleDelete(&two, Slice("twofoo")); - batch.DeleteRange(&two, Slice("3foo"), Slice("4foo")); - batch.Merge(&three, Slice("threethree"), Slice("3three")); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Merge(Slice("omom"), Slice("nom")); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(foo, bar)" "PutCF(2, twofoo, bar2)" @@ -650,14 +669,14 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { WriteBatchWithIndex batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Put(&two, Slice("twofoo"), Slice("bar2")); - batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); - batch.Delete(&eight, Slice("eightfoo")); - batch.SingleDelete(&two, Slice("twofoo")); - batch.Merge(&three, Slice("threethree"), Slice("3three")); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Merge(Slice("omom"), Slice("nom")); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), 
Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); std::unique_ptr iter; @@ -736,7 +755,7 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { ASSERT_TRUE(!iter->Valid()); TestHandler handler; - batch.GetWriteBatch()->Iterate(&handler); + ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler)); ASSERT_EQ( "Put(foo, bar)" "PutCF(2, twofoo, bar2)" @@ -755,12 +774,12 @@ TEST_F(WriteBatchTest, SavePointTest) { WriteBatch batch; batch.SetSavePoint(); - batch.Put("A", "a"); - batch.Put("B", "b"); + ASSERT_OK(batch.Put("A", "a")); + ASSERT_OK(batch.Put("B", "b")); batch.SetSavePoint(); - batch.Put("C", "c"); - batch.Delete("A"); + ASSERT_OK(batch.Put("C", "c")); + ASSERT_OK(batch.Delete("A")); batch.SetSavePoint(); batch.SetSavePoint(); @@ -779,8 +798,8 @@ TEST_F(WriteBatchTest, SavePointTest) { "Put(B, b)@1", PrintContents(&batch)); - batch.Delete("A"); - batch.Put("B", "bb"); + ASSERT_OK(batch.Delete("A")); + ASSERT_OK(batch.Put("B", "bb")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ("", PrintContents(&batch)); @@ -789,12 +808,12 @@ TEST_F(WriteBatchTest, SavePointTest) { ASSERT_TRUE(s.IsNotFound()); ASSERT_EQ("", PrintContents(&batch)); - batch.Put("D", "d"); - batch.Delete("A"); + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); batch.SetSavePoint(); - batch.Put("A", "aaa"); + ASSERT_OK(batch.Put("A", "aaa")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ( @@ -804,8 +823,8 @@ TEST_F(WriteBatchTest, SavePointTest) { batch.SetSavePoint(); - batch.Put("D", "d"); - batch.Delete("A"); + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ( @@ -826,7 +845,7 @@ TEST_F(WriteBatchTest, SavePointTest) { ASSERT_TRUE(s.IsNotFound()); ASSERT_EQ("", PrintContents(&batch2)); - batch2.Delete("A"); + ASSERT_OK(batch2.Delete("A")); batch2.SetSavePoint(); s = batch2.RollbackToSavePoint(); @@ -838,7 +857,7 @@ TEST_F(WriteBatchTest, SavePointTest) { batch2.SetSavePoint(); - batch2.Delete("B"); + ASSERT_OK(batch2.Delete("B")); ASSERT_EQ("Delete(B)@0", PrintContents(&batch2)); batch2.SetSavePoint(); @@ -861,7 +880,7 @@ TEST_F(WriteBatchTest, SavePointTest) { ASSERT_EQ("", PrintContents(&batch3)); batch3.SetSavePoint(); - batch3.Delete("A"); + ASSERT_OK(batch3.Delete("A")); s = batch3.PopSavePoint(); ASSERT_OK(s); diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 4bfc4e911f4..a2d3f94c4bf 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -111,7 +111,7 @@ TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { void Put(const string& key, const string& val) { kvs_.push_back(std::make_pair(key, val)); - write_batch_.Put(key, val); + ASSERT_OK(write_batch_.Put(key, val)); } void Clear() { @@ -319,7 +319,7 @@ TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { DBImpl* db_impl_; } publish_seq_callback(db_impl); // seq_per_batch_ requires a natural batch separator or Noop - WriteBatchInternal::InsertNoop(&write_op.write_batch_); + ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_)); const size_t ONE_BATCH = 1; s = db_impl->WriteImpl(woptions, &write_op.write_batch_, &write_op.callback_, nullptr, 0, false, nullptr, @@ -396,8 +396,8 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { WriteBatch wb; - wb.Put("a", "value.a"); - wb.Delete("x"); + ASSERT_OK(wb.Put("a", "value.a")); + ASSERT_OK(wb.Delete("x")); // Test a simple Write s = db->Write(write_options, &wb); @@ -411,7 +411,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { WriteCallbackTestWriteCallback1 
callback1; WriteBatch wb2; - wb2.Put("a", "value.a2"); + ASSERT_OK(wb2.Put("a", "value.a2")); s = db_impl->WriteWithCallback(write_options, &wb2, &callback1); ASSERT_OK(s); @@ -425,7 +425,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { WriteCallbackTestWriteCallback2 callback2; WriteBatch wb3; - wb3.Put("a", "value.a3"); + ASSERT_OK(wb3.Put("a", "value.a3")); s = db_impl->WriteWithCallback(write_options, &wb3, &callback2); ASSERT_NOK(s); diff --git a/db/write_controller.cc b/db/write_controller.cc index 5480aabd149..c5f7443752f 100644 --- a/db/write_controller.cc +++ b/db/write_controller.cc @@ -5,10 +5,12 @@ #include "db/write_controller.h" +#include #include #include #include -#include "rocksdb/env.h" + +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -19,10 +21,14 @@ std::unique_ptr WriteController::GetStopToken() { std::unique_ptr WriteController::GetDelayToken( uint64_t write_rate) { - total_delayed_++; - // Reset counters. - last_refill_time_ = 0; - bytes_left_ = 0; + if (0 == total_delayed_++) { + // Starting delay, so reset counters. + next_refill_time_ = 0; + credit_in_bytes_ = 0; + } + // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in + // next_refill_time_ will be based on an old rate. This rate will apply + // for subsequent additional debts and for the next refill. set_delayed_write_rate(write_rate); return std::unique_ptr(new DelayWriteToken(this)); } @@ -42,7 +48,7 @@ bool WriteController::IsStopped() const { // If it turns out to be a performance issue, we can redesign the thread // synchronization model here. // The function trust caller will sleep micros returned. -uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) { +uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) { if (total_stopped_.load(std::memory_order_relaxed) > 0) { return 0; } @@ -50,64 +56,51 @@ uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) { return 0; } - const uint64_t kMicrosPerSecond = 1000000; - const uint64_t kRefillInterval = 1024U; - - if (bytes_left_ >= num_bytes) { - bytes_left_ -= num_bytes; + if (credit_in_bytes_ >= num_bytes) { + credit_in_bytes_ -= num_bytes; return 0; } // The frequency to get time inside DB mutex is less than one per refill // interval. - auto time_now = NowMicrosMonotonic(env); - - uint64_t sleep_debt = 0; - uint64_t time_since_last_refill = 0; - if (last_refill_time_ != 0) { - if (last_refill_time_ > time_now) { - sleep_debt = last_refill_time_ - time_now; - } else { - time_since_last_refill = time_now - last_refill_time_; - bytes_left_ += - static_cast(static_cast(time_since_last_refill) / - kMicrosPerSecond * delayed_write_rate_); - if (time_since_last_refill >= kRefillInterval && - bytes_left_ > num_bytes) { - // If refill interval already passed and we have enough bytes - // return without extra sleeping. 
- last_refill_time_ = time_now; - bytes_left_ -= num_bytes; - return 0; - } + auto time_now = NowMicrosMonotonic(clock); + + const uint64_t kMicrosPerSecond = 1000000; + // Refill every 1 ms + const uint64_t kMicrosPerRefill = 1000; + + if (next_refill_time_ == 0) { + // Start with an initial allotment of bytes for one interval + next_refill_time_ = time_now; + } + if (next_refill_time_ <= time_now) { + // Refill based on time interval plus any extra elapsed + uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill; + credit_in_bytes_ += static_cast<uint64_t>( + 1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999); + next_refill_time_ = time_now + kMicrosPerRefill; + + if (credit_in_bytes_ >= num_bytes) { + // Avoid delay if possible, to reduce DB mutex release & re-acquire. + credit_in_bytes_ -= num_bytes; + return 0; + } } - uint64_t single_refill_amount = - delayed_write_rate_ * kRefillInterval / kMicrosPerSecond; - if (bytes_left_ + single_refill_amount >= num_bytes) { - // Wait until a refill interval - // Never trigger expire for less than one refill interval to avoid to get - // time. - bytes_left_ = bytes_left_ + single_refill_amount - num_bytes; - last_refill_time_ = time_now + kRefillInterval; - return kRefillInterval + sleep_debt; - } + // We need to delay to avoid exceeding write rate. + assert(num_bytes > credit_in_bytes_); + uint64_t bytes_over_budget = num_bytes - credit_in_bytes_; + uint64_t needed_delay = static_cast<uint64_t>( + 1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond); + + credit_in_bytes_ = 0; + next_refill_time_ += needed_delay; - // Need to refill more than one interval. Need to sleep longer. Check - // whether expiration will hit - - // Sleep just until `num_bytes` is allowed. - uint64_t sleep_amount = - static_cast(num_bytes / - static_cast(delayed_write_rate_) * - kMicrosPerSecond) + - sleep_debt; - last_refill_time_ = time_now + sleep_amount; - return sleep_amount; + // Minimum delay of refill interval, to reduce DB mutex contention. + return std::max(next_refill_time_ - time_now, kMicrosPerRefill); } -uint64_t WriteController::NowMicrosMonotonic(Env* env) { - return env->NowNanos() / std::milli::den; +uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) { + return clock->NowNanos() / std::milli::den; } StopWriteToken::~StopWriteToken() { diff --git a/db/write_controller.h b/db/write_controller.h index 785ae68965d..88bd1417f19 100644 --- a/db/write_controller.h +++ b/db/write_controller.h @@ -13,7 +13,7 @@ namespace ROCKSDB_NAMESPACE { -class Env; +class SystemClock; class WriteControllerToken; // WriteController is controlling write stalls in our write code-path. Write @@ -27,8 +27,8 @@ class WriteController { : total_stopped_(0), total_delayed_(0), total_compaction_pressure_(0), - bytes_left_(0), - last_refill_time_(0), + credit_in_bytes_(0), + next_refill_time_(0), low_pri_rate_limiter_( NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) { set_max_delayed_write_rate(_delayed_write_rate); @@ -57,7 +57,7 @@ class WriteController { // return how many microseconds the caller needs to sleep after the call // num_bytes: how many number of bytes to put into the DB. // Prerequisite: DB mutex held.
- uint64_t GetDelay(Env* env, uint64_t num_bytes); + uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes); void set_delayed_write_rate(uint64_t write_rate) { // avoid divide 0 if (write_rate == 0) { @@ -85,7 +85,7 @@ class WriteController { RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); } private: - uint64_t NowMicrosMonotonic(Env* env); + uint64_t NowMicrosMonotonic(SystemClock* clock); friend class WriteControllerToken; friend class StopWriteToken; @@ -95,11 +95,14 @@ class WriteController { std::atomic total_stopped_; std::atomic total_delayed_; std::atomic total_compaction_pressure_; - uint64_t bytes_left_; - uint64_t last_refill_time_; - // write rate set when initialization or by `DBImpl::SetDBOptions` + + // Number of bytes allowed to write without delay + uint64_t credit_in_bytes_; + // Next time that we can add more credit of bytes + uint64_t next_refill_time_; + // Write rate set when initialization or by `DBImpl::SetDBOptions` uint64_t max_delayed_write_rate_; - // current write rate + // Current write rate (bytes / second) uint64_t delayed_write_rate_; std::unique_ptr low_pri_rate_limiter_; diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index 72d116798e8..1f7cf999aaf 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -3,128 +3,240 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include - #include "db/write_controller.h" -#include "rocksdb/env.h" +#include +#include + +#include "rocksdb/system_clock.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { - -class WriteControllerTest : public testing::Test {}; - -class TimeSetEnv : public EnvWrapper { +namespace { +class TimeSetClock : public SystemClockWrapper { public: - explicit TimeSetEnv() : EnvWrapper(nullptr) {} + explicit TimeSetClock() : SystemClockWrapper(nullptr) {} + const char* Name() const override { return "TimeSetClock"; } uint64_t now_micros_ = 6666; uint64_t NowNanos() override { return now_micros_ * std::milli::den; } }; +} // namespace +class WriteControllerTest : public testing::Test { + public: + WriteControllerTest() { clock_ = std::make_shared(); } + std::shared_ptr clock_; +}; -TEST_F(WriteControllerTest, ChangeDelayRateTest) { - TimeSetEnv env; - WriteController controller(40000000u); // also set max delayed rate - controller.set_delayed_write_rate(10000000u); - auto delay_token_0 = - controller.GetDelayToken(controller.delayed_write_rate()); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_1 = controller.GetDelayToken(2000000u); - ASSERT_EQ(static_cast(10000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_2 = controller.GetDelayToken(1000000u); - ASSERT_EQ(static_cast(20000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_3 = controller.GetDelayToken(20000000u); - ASSERT_EQ(static_cast(1000000), - controller.GetDelay(&env, 20000000u)); - // This is more than max rate. Max delayed rate will be used. 
- auto delay_token_4 = - controller.GetDelayToken(controller.delayed_write_rate() * 3); - ASSERT_EQ(static_cast(500000), - controller.GetDelay(&env, 20000000u)); +// Make tests easier to read +#define MILLION *1000000u +#define MB MILLION +#define MBPS MILLION +#define SECS MILLION // in microseconds + +TEST_F(WriteControllerTest, BasicAPI) { + WriteController controller(40 MBPS); // also set max delayed rate + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + // set, get + controller.set_delayed_write_rate(20 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + { + // set with token, get + auto delay_token_0 = controller.GetDelayToken(10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + // test with delay + EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 2 SECS; // pay the "debt" + + auto delay_token_1 = controller.GetDelayToken(2 MBPS); + EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 10 SECS; // pay the "debt" + + auto delay_token_2 = controller.GetDelayToken(1 MBPS); + EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 20 SECS; // pay the "debt" + + auto delay_token_3 = controller.GetDelayToken(20 MBPS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 1 SECS; // pay the "debt" + + // 60M is more than the max rate of 40M. Max rate will be used. 
+ EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + auto delay_token_4 = + controller.GetDelayToken(controller.delayed_write_rate() * 3); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_EQ(static_cast(0.5 SECS), + controller.GetDelay(clock_.get(), 20 MB)); + + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + + // Test stop tokens + { + auto stop_token_1 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + { + auto stop_token_2 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + // Stop tokens released + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + // pay the previous "debt" + clock_->now_micros_ += static_cast(0.5 SECS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB)); + } + + // Delay tokens released + EXPECT_FALSE(controller.NeedsDelay()); } -TEST_F(WriteControllerTest, SanityTest) { - WriteController controller(10000000u); - auto stop_token_1 = controller.GetStopToken(); - auto stop_token_2 = controller.GetStopToken(); - - ASSERT_TRUE(controller.IsStopped()); - stop_token_1.reset(); - ASSERT_TRUE(controller.IsStopped()); - stop_token_2.reset(); - ASSERT_FALSE(controller.IsStopped()); - - TimeSetEnv env; - - auto delay_token_1 = controller.GetDelayToken(10000000u); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - - env.now_micros_ += 1999900u; // sleep debt 1000 +TEST_F(WriteControllerTest, StartFilled) { + WriteController controller(10 MBPS); - auto delay_token_2 = controller.GetDelayToken(10000000u); - // Rate reset after changing the token. - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - - env.now_micros_ += 1999900u; // sleep debt 1000 - - // One refill: 10240 bytes allowed, 1000 used, 9240 left - ASSERT_EQ(static_cast(1124), controller.GetDelay(&env, 1000u)); - env.now_micros_ += 1124u; // sleep debt 0 - - delay_token_2.reset(); - // 1000 used, 8240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 100u; // sleep credit 100 - // 1000 used, 7240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 100u; // sleep credit 200 - // One refill: 10240 fileed, sleep credit generates 2000. 8000 used - // 7240 + 10240 + 2000 - 8000 = 11480 left - ASSERT_EQ(static_cast(1024u), controller.GetDelay(&env, 8000u)); - - env.now_micros_ += 200u; // sleep debt 824 - // 1000 used, 10480 left. - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 200u; // sleep debt 624 - // Out of bound sleep, still 10480 left - ASSERT_EQ(static_cast(3000624u), - controller.GetDelay(&env, 30000000u)); - - env.now_micros_ += 3000724u; // sleep credit 100 - // 6000 used, 4480 left. 
- ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 6000u)); - - env.now_micros_ += 200u; // sleep credit 300 - // One refill, credit 4480 balance + 3000 credit + 10240 refill - // Use 8000, 9720 left - ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u)); + // Attempt to write two things that combined would be allowed within + // a single refill interval + auto delay_token_0 = + controller.GetDelayToken(controller.delayed_write_rate()); - env.now_micros_ += 3024u; // sleep credit 2000 + // Verify no delay because write rate has not been exceeded within + // refill interval. + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); - // 1720 left - ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u)); + // Allow refill (kMicrosPerRefill) + clock_->now_micros_ += 1000; - // 1720 balance + 20000 credit = 20170 left - // Use 8000, 12170 left - ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u)); + // Again + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); - // 4170 left - ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u)); + // Control: something bigger that would exceed write rate within interval + uint64_t delay = controller.GetDelay(clock_.get(), 10 MB); + EXPECT_GT(1.0 * delay, 0.999 SECS); + EXPECT_LT(1.0 * delay, 1.001 SECS); +} - // Need a refill - ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 9000u)); +TEST_F(WriteControllerTest, DebtAccumulation) { + WriteController controller(10 MBPS); + + std::array<std::unique_ptr<WriteControllerToken>, 10> tokens; + + // Accumulate a time delay debt with no passage of time, like many column + // families delaying writes simultaneously. (Old versions of WriteController + // would reset the debt on every GetDelayToken.) + uint64_t debt = 0; + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; + + // Now accumulate debt with some passage of time. + for (unsigned i = 0; i < tokens.size(); ++i) { + // Debt is accumulated in time, not in bytes, so this new write + // limit is not applied to prior requested delays, even if they are + // in progress. + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + uint64_t credit = debt / 2; + clock_->now_micros_ += credit; + debt -= credit; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; // consistent state + (void)debt; // appease clang-analyze + + // Verify paid down + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + + // Accumulate another debt, without accounting, while releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + // Big and small are delayed + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB)); + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + tokens[i].reset(); + } + // All tokens released. + // Verify that releasing all tokens pays down debt, even with no time passage.
+ tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); +} - delay_token_1.reset(); - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 30000000u)); - delay_token_1.reset(); - ASSERT_FALSE(controller.IsStopped()); +// This may or may not be a "good" feature, but it's an old feature +TEST_F(WriteControllerTest, CreditAccumulation) { + WriteController controller(10 MBPS); + + std::array, 10> tokens; + + // Ensure started + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit + uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */; + clock_->now_micros_ += credit; + + // Spend some credit (burst of I/O) + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB)); + // In WriteController, credit is accumulated in bytes, not in time. + // After an "unnecessary" delay, all of our time credit will be + // translated to bytes on the next operation, in this case with + // setting 1 MBPS. So regardless of the rate at delay time, we just + // account for the bytes. + credit -= 63 MB; + } + // Spend remaining credit + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit)); + // Verify + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit, no accounting + clock_->now_micros_ += 1000 SECS; + + // Spend a small amount, releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB)); + tokens[i].reset(); + } + + // All tokens released. + // Verify credit is wiped away on new delay. + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/write_thread.cc b/db/write_thread.cc index d26a694aac7..ac3a2f86915 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -208,6 +208,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, } void WriteThread::SetState(Writer* w, uint8_t new_state) { + assert(w); auto state = w->state.load(std::memory_order_acquire); if (state == STATE_LOCKED_WAITING || !w->state.compare_exchange_strong(state, new_state)) { @@ -240,6 +241,7 @@ bool WriteThread::LinkOne(Writer* w, std::atomic* newest_writer) { MutexLock lock(&stall_mu_); writers = newest_writer->load(std::memory_order_relaxed); if (writers == &write_stall_dummy_) { + TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w); stall_cv_.Wait(); // Load newest_writers_ again since it may have changed writers = newest_writer->load(std::memory_order_relaxed); @@ -464,6 +466,11 @@ size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader, break; } + if (w->protection_bytes_per_key != leader->protection_bytes_per_key) { + // Do not mix writes with different levels of integrity protection. + break; + } + if (w->batch == nullptr) { // Do not include those writes with nullptr batch. Those are not writes, // those are something else. 
They want to be alone diff --git a/db/write_thread.h b/db/write_thread.h index 41cb9842c90..b050606aa54 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -5,12 +5,11 @@ #pragma once -#include -#include - #include +#include #include #include +#include #include #include #include @@ -37,7 +36,7 @@ class WriteThread { // non-parallel informs a follower that its writes have been committed // (-> STATE_COMPLETED), or when a leader that has chosen to perform // updates in parallel and needs this Writer to apply its batch (-> - // STATE_PARALLEL_FOLLOWER). + // STATE_PARALLEL_MEMTABLE_WRITER). STATE_INIT = 1, // The state used to inform a waiting Writer that it has become the @@ -76,7 +75,6 @@ class WriteThread { struct Writer; struct WriteGroup { - ~WriteGroup() { status.PermitUncheckedError(); } Writer* leader = nullptr; Writer* last_writer = nullptr; SequenceNumber last_sequence; @@ -121,6 +119,7 @@ class WriteThread { bool disable_wal; bool disable_memtable; size_t batch_cnt; // if non-zero, number of sub-batches in the write batch + size_t protection_bytes_per_key; PreReleaseCallback* pre_release_callback; uint64_t log_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference @@ -130,7 +129,7 @@ class WriteThread { WriteGroup* write_group; SequenceNumber sequence; // the sequence number to use for the first key Status status; - Status callback_status; // status returned by callback->Callback() + Status callback_status; // status returned by callback->Callback() std::aligned_storage::type state_mutex_bytes; std::aligned_storage::type state_cv_bytes; @@ -144,6 +143,7 @@ class WriteThread { disable_wal(false), disable_memtable(false), batch_cnt(0), + protection_bytes_per_key(0), pre_release_callback(nullptr), log_used(0), log_ref(0), @@ -165,6 +165,7 @@ class WriteThread { disable_wal(write_options.disableWAL), disable_memtable(_disable_memtable), batch_cnt(_batch_cnt), + protection_bytes_per_key(_batch->GetProtectionBytesPerKey()), pre_release_callback(_pre_release_callback), log_used(0), log_ref(_log_ref), @@ -245,7 +246,7 @@ class WriteThread { std::condition_variable& StateCV() { assert(made_waitable); return *static_cast( - static_cast(&state_cv_bytes)); + static_cast(&state_cv_bytes)); } }; @@ -272,7 +273,7 @@ class WriteThread { // STATE_GROUP_LEADER. If w has been made part of a sequential batch // group and the leader has performed the write, returns STATE_DONE. // If w has been made part of a parallel batch group and is responsible - // for updating the memtable, returns STATE_PARALLEL_FOLLOWER. + // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER. // // The db mutex SHOULD NOT be held when calling this function, because // it will block. @@ -309,8 +310,8 @@ class WriteThread { // the next leader if needed. void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group); - // Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the - // non-leader members of this write batch group. Sets Writer::sequence + // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of + // the non-leader members of this write batch group. Sets Writer::sequence // before waking them up. 
// // WriteGroup* write_group: Extra state used to coordinate the parallel add diff --git a/db_stress_tool/batched_ops_stress.cc b/db_stress_tool/batched_ops_stress.cc index ea1fab5696d..db81eb2bbbb 100644 --- a/db_stress_tool/batched_ops_stress.cc +++ b/db_stress_tool/batched_ops_stress.cc @@ -31,7 +31,8 @@ class BatchedOpsStressTest : public StressTest { std::string keys[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; std::string values[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; Slice value_slices[10]; - WriteBatch batch; + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, 0 /* ts_sz */, + FLAGS_batch_protection_bytes_per_key); Status s; auto cfh = column_families_[rand_column_families[0]]; std::string key_str = Key(rand_keys[0]); @@ -66,7 +67,8 @@ class BatchedOpsStressTest : public StressTest { std::unique_ptr& /* lock */) override { std::string keys[10] = {"9", "7", "5", "3", "1", "8", "6", "4", "2", "0"}; - WriteBatch batch; + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, 0 /* ts_sz */, + FLAGS_batch_protection_bytes_per_key); Status s; auto cfh = column_families_[rand_column_families[0]]; std::string key_str = Key(rand_keys[0]); diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index f1823302a29..eccb9b554d5 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -16,10 +16,10 @@ #include "util/file_checksum_helper.h" #include "util/xxhash.h" -ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env = nullptr; +ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr; #ifndef NDEBUG // If non-null, injects read error at a rate specified by the -// read_fault_one_in flag +// read_fault_one_in or write_fault_one_in flag std::shared_ptr fault_fs_guard; #endif // NDEBUG enum ROCKSDB_NAMESPACE::CompressionType compression_type_e = @@ -30,7 +30,7 @@ enum ROCKSDB_NAMESPACE::ChecksumType checksum_type_e = ROCKSDB_NAMESPACE::kCRC32c; enum RepFactory FLAGS_rep_factory = kSkipList; std::vector sum_probs(100001); -int64_t zipf_sum_size = 100000; +constexpr int64_t zipf_sum_size = 100000; namespace ROCKSDB_NAMESPACE { @@ -160,8 +160,10 @@ void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz) { snprintf(buf, 4, "%X", value[i]); tmp.append(buf); } - fprintf(stdout, "[CF %d] %" PRIi64 " == > (%" ROCKSDB_PRIszt ") %s\n", cf, - key, sz, tmp.c_str()); + auto key_str = Key(key); + Slice key_slice = key_str; + fprintf(stdout, "[CF %d] %s (%" PRIi64 ") == > (%" ROCKSDB_PRIszt ") %s\n", + cf, key_slice.ToString(true).c_str(), key, sz, tmp.c_str()); } // Note that if hot_key_alpha != 0, it generates the key based on Zipfian @@ -231,6 +233,15 @@ size_t GenerateValue(uint32_t rand, char* v, size_t max_sz) { return value_sz; // the size of the value set. 
} +std::string NowNanosStr() { + uint64_t t = db_stress_env->NowNanos(); + std::string ret; + PutFixed64(&ret, t); + return ret; +} + +std::string GenerateTimestampForRead() { return NowNanosStr(); } + namespace { class MyXXH64Checksum : public FileChecksumGenerator { diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 67d4530c5ce..a7476594278 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -132,6 +132,9 @@ DECLARE_int32(set_options_one_in); DECLARE_int32(set_in_place_one_in); DECLARE_int64(cache_size); DECLARE_bool(cache_index_and_filter_blocks); +DECLARE_int32(top_level_index_pinning); +DECLARE_int32(partition_pinning); +DECLARE_int32(unpartitioned_pinning); DECLARE_bool(use_clock_cache); DECLARE_uint64(subcompactions); DECLARE_uint64(periodic_compaction_seconds); @@ -141,6 +144,7 @@ DECLARE_bool(enable_write_thread_adaptive_yield); DECLARE_int32(reopen); DECLARE_double(bloom_bits); DECLARE_bool(use_block_based_filter); +DECLARE_bool(use_ribbon_filter); DECLARE_bool(partition_filters); DECLARE_bool(optimize_filters_for_memory); DECLARE_int32(index_type); @@ -202,6 +206,7 @@ DECLARE_string(bottommost_compression_type); DECLARE_int32(compression_max_dict_bytes); DECLARE_int32(compression_zstd_max_train_bytes); DECLARE_int32(compression_parallel_threads); +DECLARE_uint64(compression_max_dict_buffer_bytes); DECLARE_string(checksum_type); DECLARE_string(hdfs); DECLARE_string(env_uri); @@ -228,6 +233,7 @@ DECLARE_int32(get_property_one_in); DECLARE_string(file_checksum_impl); #ifndef ROCKSDB_LITE +// Options for StackableDB-based BlobDB DECLARE_bool(use_blob_db); DECLARE_uint64(blob_db_min_blob_size); DECLARE_uint64(blob_db_bytes_per_sync); @@ -235,6 +241,16 @@ DECLARE_uint64(blob_db_file_size); DECLARE_bool(blob_db_enable_gc); DECLARE_double(blob_db_gc_cutoff); #endif // !ROCKSDB_LITE + +// Options for integrated BlobDB +DECLARE_bool(allow_setting_blob_options_dynamically); +DECLARE_bool(enable_blob_files); +DECLARE_uint64(min_blob_size); +DECLARE_uint64(blob_file_size); +DECLARE_string(blob_compression_type); +DECLARE_bool(enable_blob_garbage_collection); +DECLARE_double(blob_garbage_collection_age_cutoff); + DECLARE_int32(approximate_size_one_in); DECLARE_bool(sync_fault_injection); @@ -242,13 +258,17 @@ DECLARE_bool(best_efforts_recovery); DECLARE_bool(skip_verifydb); DECLARE_bool(enable_compaction_filter); DECLARE_bool(paranoid_file_checks); +DECLARE_bool(fail_if_options_file_error); +DECLARE_uint64(batch_protection_bytes_per_key); -const long KB = 1024; -const int kRandomValueMaxFactor = 3; -const int kValueMaxLen = 100; +DECLARE_uint64(user_timestamp_size); + +constexpr long KB = 1024; +constexpr int kRandomValueMaxFactor = 3; +constexpr int kValueMaxLen = 100; // wrapped posix or hdfs environment -extern ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env; +extern ROCKSDB_NAMESPACE::Env* db_stress_env; #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { class FaultInjectionTestFS; @@ -544,6 +564,9 @@ extern StressTest* CreateNonBatchedOpsStressTest(); extern void InitializeHotKeyGenerator(double alpha); extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key); +extern std::string GenerateTimestampForRead(); +extern std::string NowNanosStr(); + std::shared_ptr GetFileChecksumImpl( const std::string& name); } // namespace ROCKSDB_NAMESPACE diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index 69411aa2936..b944214ec6d 100644 --- a/db_stress_tool/db_stress_driver.cc +++ 
b/db_stress_tool/db_stress_driver.cc @@ -57,6 +57,7 @@ void ThreadBody(void* v) { } bool RunStressTest(StressTest* stress) { + SystemClock* clock = db_stress_env->GetSystemClock().get(); stress->InitDb(); SharedState shared(db_stress_env, stress); stress->FinishInitDb(&shared); @@ -69,9 +70,9 @@ bool RunStressTest(StressTest* stress) { uint32_t n = shared.GetNumThreads(); - uint64_t now = db_stress_env->NowMicros(); + uint64_t now = clock->NowMicros(); fprintf(stdout, "%s Initializing worker threads\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); std::vector threads(n); for (uint32_t i = 0; i < n; i++) { threads[i] = new ThreadState(i, &shared); @@ -104,9 +105,9 @@ bool RunStressTest(StressTest* stress) { } } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); fprintf(stdout, "%s Starting database operations\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); shared.SetStart(); shared.GetCondVar()->SignalAll(); @@ -114,16 +115,16 @@ bool RunStressTest(StressTest* stress) { shared.GetCondVar()->Wait(); } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); if (FLAGS_test_batches_snapshots) { fprintf(stdout, "%s Limited verification already done during gets\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); } else if (FLAGS_skip_verifydb) { fprintf(stdout, "%s Verification skipped\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); } else { fprintf(stdout, "%s Starting verification\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); } shared.SetStartVerify(); @@ -142,11 +143,11 @@ bool RunStressTest(StressTest* stress) { delete threads[i]; threads[i] = nullptr; } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots && !shared.HasVerificationFailedYet()) { fprintf(stdout, "%s Verification successful\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); } stress->PrintStatistics(); diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index 484071f1067..f517a489b06 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -28,7 +28,9 @@ class DbStressEnvWrapper : public EnvWrapper { f.find(".restore") != std::string::npos) { return target()->DeleteFile(f); } - return Status::OK(); + // Rename the file instead of deletion to keep the history, and + // at the same time it is not visible to RocksDB. + return target()->RenameFile(f, f + "_renamed_"); } // If true, all manifest files will not be delted in DeleteFile(). 
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 7dcdeefce96..df2fc38c23f 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -287,6 +287,24 @@ DEFINE_int64(cache_size, 2LL * KB * KB * KB, DEFINE_bool(cache_index_and_filter_blocks, false, "True if indexes/filters should be cached in block cache."); +DEFINE_int32( + top_level_index_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for top-level indexes into metadata partitions (see " + "`enum PinningTier` in table.h)"); + +DEFINE_int32( + partition_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for metadata partitions (see `enum PinningTier` in " + "table.h)"); + +DEFINE_int32( + unpartitioned_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for unpartitioned metadata blocks (see `enum PinningTier` " + "in table.h)"); + DEFINE_bool(use_clock_cache, false, "Replace default LRU block cache with clock cache."); @@ -307,33 +325,68 @@ DEFINE_bool(enable_write_thread_adaptive_yield, true, "Use a yielding spin loop for brief writer thread waits."); #ifndef ROCKSDB_LITE -// BlobDB Options -DEFINE_bool(use_blob_db, false, "Use BlobDB."); +// Options for StackableDB-based BlobDB +DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB."); -DEFINE_uint64(blob_db_min_blob_size, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, - "Smallest blob to store in a file. Blobs smaller than this " - "will be inlined with the key in the LSM tree."); +DEFINE_uint64( + blob_db_min_blob_size, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, + "[Stacked BlobDB] Smallest blob to store in a file. Blobs " + "smaller than this will be inlined with the key in the LSM tree."); -DEFINE_uint64(blob_db_bytes_per_sync, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, - "Sync blob files once per every N bytes written."); +DEFINE_uint64( + blob_db_bytes_per_sync, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, + "[Stacked BlobDB] Sync blob files once per every N bytes written."); DEFINE_uint64(blob_db_file_size, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size, - "Target size of each blob file."); + "[Stacked BlobDB] Target size of each blob file."); DEFINE_bool( blob_db_enable_gc, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection, - "Enable BlobDB garbage collection."); + "[Stacked BlobDB] Enable BlobDB garbage collection."); DEFINE_double( blob_db_gc_cutoff, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff, - "Cutoff ratio for BlobDB garbage collection."); + "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection."); #endif // !ROCKSDB_LITE +// Options for integrated BlobDB +DEFINE_bool(allow_setting_blob_options_dynamically, false, + "[Integrated BlobDB] Allow setting blob options dynamically."); + +DEFINE_bool( + enable_blob_files, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files, + "[Integrated BlobDB] Enable writing large values to separate blob files."); + +DEFINE_uint64(min_blob_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size, + "[Integrated BlobDB] The size of the smallest value to be stored " + "separately in a blob file."); + +DEFINE_uint64(blob_file_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size, + "[Integrated BlobDB] The size limit for blob files."); + +DEFINE_string(blob_compression_type, "none", 
+ "[Integrated BlobDB] The compression algorithm to use for large " + "values stored in blob files."); + +DEFINE_bool(enable_blob_garbage_collection, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .enable_blob_garbage_collection, + "[Integrated BlobDB] Enable blob garbage collection."); + +DEFINE_double(blob_garbage_collection_age_cutoff, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_age_cutoff, + "[Integrated BlobDB] The cutoff in terms of blob file age for " + "garbage collection."); + static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); @@ -357,6 +410,9 @@ DEFINE_bool(use_block_based_filter, false, "use block based filter" "instead of full filter for block based table"); +DEFINE_bool(use_ribbon_filter, false, + "Use Ribbon filter instead of Bloom filter"); + DEFINE_bool(partition_filters, false, "use partitioned filters " "for block-based table"); @@ -417,7 +473,6 @@ DEFINE_int32(kill_random_test, 0, "probability 1/this"); static const bool FLAGS_kill_random_test_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_kill_random_test, &ValidateInt32Positive); -extern int rocksdb_kill_odds; DEFINE_string(kill_exclude_prefixes, "", "If non-empty, kill points with prefix in the list given will be" @@ -605,6 +660,10 @@ DEFINE_int32(compression_zstd_max_train_bytes, 0, DEFINE_int32(compression_parallel_threads, 1, "Number of threads for parallel compression."); +DEFINE_uint64(compression_max_dict_buffer_bytes, 0, + "Buffering limit for SST file data to sample for dictionary " + "compression."); + DEFINE_string(bottommost_compression_type, "disable", "Algorithm to use to compress bottommost level of the database. " "\"disable\" means disabling the feature"); @@ -732,8 +791,28 @@ DEFINE_bool(paranoid_file_checks, true, "After writing every SST file, reopen it and read all the keys " "and validate checksums"); +DEFINE_bool(fail_if_options_file_error, false, + "Fail operations that fail to detect or properly persist options " + "file."); + +DEFINE_uint64(batch_protection_bytes_per_key, 0, + "If nonzero, enables integrity protection in `WriteBatch` at the " + "specified number of bytes per key. Currently the only supported " + "nonzero value is eight."); + DEFINE_string(file_checksum_impl, "none", "Name of an implementation for file_checksum_gen_factory, or " "\"none\" for null."); +DEFINE_int32(write_fault_one_in, 0, + "On non-zero, enables fault injection on write"); + +DEFINE_uint64(user_timestamp_size, 0, + "Number of bytes for a user-defined timestamp. 
Currently, only " + "8-byte is supported"); + +DEFINE_int32(open_metadata_write_fault_one_in, 0, + "On non-zero, enables fault injection on file metadata write " + "during DB reopen."); + #endif // GFLAGS diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h index 86310f82f75..03bc0784c74 100644 --- a/db_stress_tool/db_stress_shared_state.h +++ b/db_stress_tool/db_stress_shared_state.h @@ -29,6 +29,8 @@ DECLARE_bool(test_batches_snapshots); DECLARE_int32(compaction_thread_pool_adjust_interval); DECLARE_int32(continuous_verification_interval); DECLARE_int32(read_fault_one_in); +DECLARE_int32(write_fault_one_in); +DECLARE_int32(open_metadata_write_fault_one_in); namespace ROCKSDB_NAMESPACE { class StressTest; @@ -417,6 +419,8 @@ struct ThreadState { std::string value; // optional state of all keys in the db std::vector* key_vec; + + std::string timestamp; }; std::queue> snapshot_queue; diff --git a/db_stress_tool/db_stress_stat.h b/db_stress_tool/db_stress_stat.h index e8bc0986a67..429cf3b2a47 100644 --- a/db_stress_tool/db_stress_stat.h +++ b/db_stress_tool/db_stress_stat.h @@ -11,9 +11,9 @@ #include "monitoring/histogram.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/snapshot.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "util/gflags_compat.h" #include "util/random.h" @@ -73,7 +73,7 @@ class Stats { seconds_ = 0; num_compact_files_succeed_ = 0; num_compact_files_failed_ = 0; - start_ = Env::Default()->NowMicros(); + start_ = SystemClock::Default()->NowMicros(); last_op_finish_ = start_; finish_ = start_; } @@ -102,13 +102,13 @@ class Stats { } void Stop() { - finish_ = Env::Default()->NowMicros(); + finish_ = SystemClock::Default()->NowMicros(); seconds_ = (finish_ - start_) * 1e-6; } void FinishedSingleOp() { if (FLAGS_histogram) { - auto now = Env::Default()->NowMicros(); + auto now = SystemClock::Default()->NowMicros(); auto micros = now - last_op_finish_; hist_.Add(micros); if (micros > 20000) { diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 38f435b6eeb..6a649eca9b8 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -15,22 +15,48 @@ #include "db_stress_tool/db_stress_table_properties_collector.h" #include "rocksdb/convenience.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/types.h" #include "util/cast_util.h" +#include "utilities/backupable/backupable_db_impl.h" #include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { + +namespace { + +std::shared_ptr CreateFilterPolicy() { + if (FLAGS_bloom_bits < 0) { + return BlockBasedTableOptions().filter_policy; + } + const FilterPolicy* new_policy; + if (FLAGS_use_ribbon_filter) { + // Old and new API should be same + if (std::random_device()() & 1) { + new_policy = NewExperimentalRibbonFilterPolicy(FLAGS_bloom_bits); + } else { + new_policy = NewRibbonFilterPolicy(FLAGS_bloom_bits); + } + } else { + if (FLAGS_use_block_based_filter) { + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, true); + } else { + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, false); + } + } + return std::shared_ptr(new_policy); +} + +} // namespace + StressTest::StressTest() : cache_(NewCache(FLAGS_cache_size)), compressed_cache_(NewLRUCache(FLAGS_compressed_cache_size)), - filter_policy_(FLAGS_bloom_bits >= 0 - ? FLAGS_use_block_based_filter - ? 
NewBloomFilterPolicy(FLAGS_bloom_bits, true) - : NewBloomFilterPolicy(FLAGS_bloom_bits, false) - : nullptr), + filter_policy_(CreateFilterPolicy()), db_(nullptr), #ifndef ROCKSDB_LITE txn_db_(nullptr), #endif + clock_(db_stress_env->GetSystemClock().get()), new_column_family_name_(1), num_times_reopened_(0), db_preload_finished_(false), @@ -104,6 +130,22 @@ std::shared_ptr StressTest::NewCache(size_t capacity) { } } +std::vector StressTest::GetBlobCompressionTags() { + std::vector compression_tags{"kNoCompression"}; + + if (Snappy_Supported()) { + compression_tags.emplace_back("kSnappyCompression"); + } + if (LZ4_Supported()) { + compression_tags.emplace_back("kLZ4Compression"); + } + if (ZSTD_Supported()) { + compression_tags.emplace_back("kZSTD"); + } + + return compression_tags; +} + bool StressTest::BuildOptionsTable() { if (FLAGS_set_options_one_in <= 0) { return true; @@ -182,6 +224,21 @@ bool StressTest::BuildOptionsTable() { {"max_sequential_skip_in_iterations", {"4", "8", "12"}}, }; + if (FLAGS_allow_setting_blob_options_dynamically) { + options_tbl.emplace("enable_blob_files", + std::vector{"false", "true"}); + options_tbl.emplace("min_blob_size", + std::vector{"0", "8", "16"}); + options_tbl.emplace("blob_file_size", + std::vector{"1M", "16M", "256M", "1G"}); + options_tbl.emplace("blob_compression_type", GetBlobCompressionTags()); + options_tbl.emplace("enable_blob_garbage_collection", + std::vector{"false", "true"}); + options_tbl.emplace( + "blob_garbage_collection_age_cutoff", + std::vector{"0.0", "0.25", "0.5", "0.75", "1.0"}); + } + options_table_ = std::move(options_tbl); for (const auto& iter : options_table_) { @@ -191,9 +248,9 @@ bool StressTest::BuildOptionsTable() { } void StressTest::InitDb() { - uint64_t now = db_stress_env->NowMicros(); + uint64_t now = clock_->NowMicros(); fprintf(stdout, "%s Initializing db_stress\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock_->TimeToString(now / 1000000).c_str()); PrintEnv(); Open(); BuildOptionsTable(); @@ -201,25 +258,28 @@ void StressTest::InitDb() { void StressTest::FinishInitDb(SharedState* shared) { if (FLAGS_read_only) { - uint64_t now = db_stress_env->NowMicros(); + uint64_t now = clock_->NowMicros(); fprintf(stdout, "%s Preloading db with %" PRIu64 " KVs\n", - db_stress_env->TimeToString(now / 1000000).c_str(), FLAGS_max_key); + clock_->TimeToString(now / 1000000).c_str(), FLAGS_max_key); PreloadDbAndReopenAsReadOnly(FLAGS_max_key, shared); } if (FLAGS_enable_compaction_filter) { - reinterpret_cast( - options_.compaction_filter_factory.get()) - ->SetSharedState(shared); + auto* compaction_filter_factory = + reinterpret_cast( + options_.compaction_filter_factory.get()); + assert(compaction_filter_factory); + compaction_filter_factory->SetSharedState(shared); + fprintf(stdout, "Compaction filter factory: %s\n", + compaction_filter_factory->Name()); } } bool StressTest::VerifySecondaries() { #ifndef ROCKSDB_LITE if (FLAGS_test_secondary) { - uint64_t now = db_stress_env->NowMicros(); - fprintf( - stdout, "%s Start to verify secondaries against primary\n", - db_stress_env->TimeToString(static_cast(now) / 1000000).c_str()); + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Start to verify secondaries against primary\n", + clock_->TimeToString(static_cast(now) / 1000000).c_str()); } for (size_t k = 0; k != secondaries_.size(); ++k) { Status s = secondaries_[k]->TryCatchUpWithPrimary(); @@ -261,10 +321,9 @@ bool StressTest::VerifySecondaries() { } } if (FLAGS_test_secondary) { - uint64_t now = 
db_stress_env->NowMicros(); - fprintf( - stdout, "%s Verification of secondaries succeeded\n", - db_stress_env->TimeToString(static_cast(now) / 1000000).c_str()); + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Verification of secondaries succeeded\n", + clock_->TimeToString(static_cast(now) / 1000000).c_str()); } #endif // ROCKSDB_LITE return true; @@ -278,6 +337,11 @@ Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf, } ReadOptions ropt; ropt.snapshot = snap_state.snapshot; + Slice ts; + if (!snap_state.timestamp.empty()) { + ts = snap_state.timestamp; + ropt.timestamp = &ts; + } PinnableSlice exp_v(&snap_state.value); exp_v.PinSelf(); PinnableSlice v; @@ -329,9 +393,11 @@ void StressTest::VerificationAbort(SharedState* shared, std::string msg, void StressTest::VerificationAbort(SharedState* shared, std::string msg, int cf, int64_t key) const { + auto key_str = Key(key); + Slice key_slice = key_str; fprintf(stderr, - "Verification failed for column family %d key %" PRIi64 ": %s\n", cf, - key, msg.c_str()); + "Verification failed for column family %d key %s (%" PRIi64 "): %s\n", + cf, key_slice.ToString(true).c_str(), key, msg.c_str()); shared->SetVerificationFailure(); } @@ -381,6 +447,13 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, } } else { if (!FLAGS_use_txn) { + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = NowNanosStr(); + ts = ts_str; + write_opts.timestamp = &ts; + } s = db_->Put(write_opts, cfh, key, v); } else { #ifndef ROCKSDB_LITE @@ -421,9 +494,9 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, #endif db_preload_finished_.store(true); - auto now = db_stress_env->NowMicros(); + auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database in read-only\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock_->TimeToString(now / 1000000).c_str()); // Reopen as read-only, can ignore all options related to updates Open(); } else { @@ -466,7 +539,7 @@ Status StressTest::NewTxn(WriteOptions& write_opts, Transaction** txn) { } static std::atomic txn_id = {0}; TransactionOptions txn_options; - txn_options.lock_timeout = 60000; // 1min + txn_options.lock_timeout = 600000; // 10 min txn_options.deadlock_detect = true; *txn = txn_db_->BeginTransaction(write_opts, txn_options); auto istr = std::to_string(txn_id.fetch_add(1)); @@ -520,6 +593,15 @@ void StressTest::OperateDb(ThreadState* thread) { fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(), FLAGS_read_fault_one_in); } + if (FLAGS_write_fault_one_in) { + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + std::vector types = {FileType::kTableFile, + FileType::kDescriptorFile, + FileType::kCurrentFile}; + fault_fs_guard->SetRandomWriteError( + thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, types); + } #endif // NDEBUG thread->stats.Start(); for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) { @@ -613,7 +695,8 @@ void StressTest::OperateDb(ThreadState* thread) { #ifndef ROCKSDB_LITE // Verify GetLiveFiles with a 1 in N chance. - if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in)) { + if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in) && + !FLAGS_write_fault_one_in) { Status status = VerifyGetLiveFiles(); if (!status.ok()) { VerificationAbort(shared, "VerifyGetLiveFiles status not OK", status); @@ -714,6 +797,20 @@ void StressTest::OperateDb(ThreadState* thread) { } } + // Assign timestamps if necessary. 
+ std::string read_ts_str; + std::string write_ts_str; + Slice read_ts; + Slice write_ts; + if (ShouldAcquireMutexOnKey() && FLAGS_user_timestamp_size > 0) { + read_ts_str = GenerateTimestampForRead(); + read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } + int prob_op = thread->rand.Uniform(100); // Reset this in case we pick something other than a read op. We don't // want to use a stale value when deciding at the beginning of the loop @@ -804,8 +901,16 @@ std::vector StressTest::GetWhiteBoxKeys(ThreadState* thread, std::vector boundaries; for (const LevelMetaData& lmd : cfmd.levels) { for (const SstFileMetaData& sfmd : lmd.files) { - boundaries.push_back(sfmd.smallestkey); - boundaries.push_back(sfmd.largestkey); + // If FLAGS_user_timestamp_size > 0, then both smallestkey and largestkey + // have timestamps. + const auto& skey = sfmd.smallestkey; + const auto& lkey = sfmd.largestkey; + assert(skey.size() >= FLAGS_user_timestamp_size); + assert(lkey.size() >= FLAGS_user_timestamp_size); + boundaries.push_back( + skey.substr(0, skey.size() - FLAGS_user_timestamp_size)); + boundaries.push_back( + lkey.substr(0, lkey.size() - FLAGS_user_timestamp_size)); } } if (boundaries.empty()) { @@ -955,6 +1060,7 @@ Status StressTest::TestIterate(ThreadState* thread, // iterators with the same set-up, and it doesn't hurt to check them // to be equal. ReadOptions cmp_ro; + cmp_ro.timestamp = readoptionscopy.timestamp; cmp_ro.snapshot = snapshot; cmp_ro.total_order_seek = true; ColumnFamilyHandle* cmp_cfh = @@ -1074,21 +1180,25 @@ void StressTest::VerifyIterator(ThreadState* thread, *diverged = true; return; } else if (op == kLastOpSeek && ro.iterate_lower_bound != nullptr && - (options_.comparator->Compare(*ro.iterate_lower_bound, seek_key) >= - 0 || + (options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, seek_key, + /*b_has_ts=*/false) >= 0 || (ro.iterate_upper_bound != nullptr && - options_.comparator->Compare(*ro.iterate_lower_bound, - *ro.iterate_upper_bound) >= 0))) { + options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, + *ro.iterate_upper_bound, /*b_has_ts*/ false) >= 0))) { // Lower bound behavior is not well defined if it is larger than // seek key or upper bound. Disable the check for now. *diverged = true; return; } else if (op == kLastOpSeekForPrev && ro.iterate_upper_bound != nullptr && - (options_.comparator->Compare(*ro.iterate_upper_bound, seek_key) <= - 0 || + (options_.comparator->CompareWithoutTimestamp( + *ro.iterate_upper_bound, /*a_has_ts=*/false, seek_key, + /*b_has_ts=*/false) <= 0 || (ro.iterate_lower_bound != nullptr && - options_.comparator->Compare(*ro.iterate_lower_bound, - *ro.iterate_upper_bound) >= 0))) { + options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, + *ro.iterate_upper_bound, /*b_has_ts=*/false) >= 0))) { // Uppder bound behavior is not well defined if it is smaller than // seek key or lower bound. Disable the check for now. 
*diverged = true; @@ -1157,9 +1267,13 @@ void StressTest::VerifyIterator(ThreadState* thread, if ((iter->Valid() && iter->key() != cmp_iter->key()) || (!iter->Valid() && (ro.iterate_upper_bound == nullptr || - cmp->Compare(total_order_key, *ro.iterate_upper_bound) < 0) && + cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false, + *ro.iterate_upper_bound, + /*b_has_ts=*/false) < 0) && (ro.iterate_lower_bound == nullptr || - cmp->Compare(total_order_key, *ro.iterate_lower_bound) > 0))) { + cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false, + *ro.iterate_lower_bound, + /*b_has_ts=*/false) > 0))) { fprintf(stderr, "Iterator diverged from control iterator which" " has value %s %s\n", @@ -1245,11 +1359,6 @@ Status StressTest::TestBackupRestore( backup_opts.share_files_with_checksum_naming | BackupableDBOptions::kFlagIncludeFileSize; } - if (thread->rand.OneIn(2)) { - backup_opts.share_files_with_checksum_naming = - backup_opts.share_files_with_checksum_naming | - BackupableDBOptions::kFlagMatchInterimNaming; - } } } BackupEngine* backup_engine = nullptr; @@ -1259,6 +1368,12 @@ Status StressTest::TestBackupRestore( from = "BackupEngine::Open"; } if (s.ok()) { + if (thread->rand.OneIn(2)) { + TEST_FutureSchemaVersion2Options test_opts; + test_opts.crc32c_checksums = thread->rand.OneIn(2) == 0; + test_opts.file_sizes = thread->rand.OneIn(2) == 0; + TEST_EnableWriteFutureSchemaVersion2(backup_engine, test_opts); + } s = backup_engine->CreateNewBackup(db_); if (!s.ok()) { from = "BackupEngine::CreateNewBackup"; @@ -1273,8 +1388,13 @@ Status StressTest::TestBackupRestore( } } std::vector backup_info; + // If inplace_not_restore, we verify the backup by opening it as a + // read-only DB. If !inplace_not_restore, we restore it to a temporary + // directory for verification. + bool inplace_not_restore = thread->rand.OneIn(3); if (s.ok()) { - backup_engine->GetBackupInfo(&backup_info); + backup_engine->GetBackupInfo(&backup_info, + /*include_file_details*/ inplace_not_restore); if (backup_info.empty()) { s = Status::NotFound("no backups found"); from = "BackupEngine::GetBackupInfo"; @@ -1290,8 +1410,8 @@ Status StressTest::TestBackupRestore( } const bool allow_persistent = thread->tid == 0; // not too many bool from_latest = false; - if (s.ok()) { - int count = static_cast(backup_info.size()); + int count = static_cast(backup_info.size()); + if (s.ok() && !inplace_not_restore) { if (count > 1) { s = backup_engine->RestoreDBFromBackup( RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id, @@ -1309,7 +1429,9 @@ Status StressTest::TestBackupRestore( } } } - if (s.ok()) { + if (s.ok() && !inplace_not_restore) { + // Purge early if restoring, to ensure the restored directory doesn't + // have some secret dependency on the backup directory. 
uint32_t to_keep = 0; if (allow_persistent) { // allow one thread to keep up to 2 backups @@ -1337,10 +1459,21 @@ Status StressTest::TestBackupRestore( for (auto name : column_family_names_) { cf_descriptors.emplace_back(name, ColumnFamilyOptions(restore_options)); } - s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, - &restored_cf_handles, &restored_db); - if (!s.ok()) { - from = "DB::Open in backup/restore"; + if (inplace_not_restore) { + BackupInfo& info = backup_info[thread->rand.Uniform(count)]; + restore_options.env = info.env_for_open.get(); + s = DB::OpenForReadOnly(DBOptions(restore_options), info.name_for_open, + cf_descriptors, &restored_cf_handles, + &restored_db); + if (!s.ok()) { + from = "DB::OpenForReadOnly in backup/restore"; + } + } else { + s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, + &restored_cf_handles, &restored_db); + if (!s.ok()) { + from = "DB::Open in backup/restore"; + } } } // Note the column families chosen by `rand_column_families` cannot be @@ -1354,8 +1487,16 @@ Status StressTest::TestBackupRestore( std::string key_str = Key(rand_keys[0]); Slice key = key_str; std::string restored_value; + ReadOptions read_opts; + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + read_opts.timestamp = &ts; + } Status get_status = restored_db->Get( - ReadOptions(), restored_cf_handles[rand_column_families[i]], key, + read_opts, restored_cf_handles[rand_column_families[i]], key, &restored_value); bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]); if (get_status.ok()) { @@ -1373,10 +1514,6 @@ Status StressTest::TestBackupRestore( } } } - if (backup_engine != nullptr) { - delete backup_engine; - backup_engine = nullptr; - } if (restored_db != nullptr) { for (auto* cf_handle : restored_cf_handles) { restored_db->DestroyColumnFamilyHandle(cf_handle); @@ -1384,6 +1521,22 @@ Status StressTest::TestBackupRestore( delete restored_db; restored_db = nullptr; } + if (s.ok() && inplace_not_restore) { + // Purge late if inplace open read-only + uint32_t to_keep = 0; + if (allow_persistent) { + // allow one thread to keep up to 2 backups + to_keep = thread->rand.Uniform(3); + } + s = backup_engine->PurgeOldBackups(to_keep); + if (!s.ok()) { + from = "BackupEngine::PurgeOldBackups"; + } + } + if (backup_engine != nullptr) { + delete backup_engine; + backup_engine = nullptr; + } if (s.ok()) { // Preserve directories on failure, or allowed persistent backup if (!allow_persistent) { @@ -1455,7 +1608,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread, FLAGS_db + "/.checkpoint" + ToString(thread->tid); Options tmp_opts(options_); tmp_opts.listeners.clear(); - tmp_opts.env = db_stress_env->target(); + tmp_opts.env = db_stress_env; DestroyDB(checkpoint_dir, tmp_opts); @@ -1488,11 +1641,11 @@ Status StressTest::TestCheckpoint(ThreadState* thread, } } } + delete checkpoint; + checkpoint = nullptr; std::vector cf_handles; DB* checkpoint_db = nullptr; if (s.ok()) { - delete checkpoint; - checkpoint = nullptr; Options options(options_); options.listeners.clear(); std::vector cf_descs; @@ -1677,7 +1830,7 @@ Status StressTest::TestPauseBackground(ThreadState* thread) { // 1 chance in 625 of pausing full 16s.) 
int pwr2_micros = std::min(thread->rand.Uniform(25), thread->rand.Uniform(25)); - db_stress_env->SleepForMicroseconds(1 << pwr2_micros); + clock_->SleepForMicroseconds(1 << pwr2_micros); return db_->ContinueBackgroundWork(); } @@ -1686,6 +1839,7 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, const std::string& keystr, uint64_t i) { Slice key = keystr; ColumnFamilyHandle* column_family = column_families_[rand_column_family]; + ReadOptions ropt; #ifndef ROCKSDB_LITE auto db_impl = static_cast_with_check(db_->GetRootDB()); const bool ww_snapshot = thread->rand.OneIn(10); @@ -1695,8 +1849,19 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, #else const Snapshot* snapshot = db_->GetSnapshot(); #endif // !ROCKSDB_LITE - ReadOptions ropt; ropt.snapshot = snapshot; + + // Ideally, we want snapshot taking and timestamp generation to be atomic + // here, so that the snapshot corresponds to the timestamp. However, it is + // not possible with current GetSnapshot() API. + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + ropt.timestamp = &ts; + } + std::string value_at; // When taking a snapshot, we also read a key from that snapshot. We // will later read the same key before releasing the snapshot and @@ -1718,10 +1883,14 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, } } - ThreadState::SnapshotState snap_state = { - snapshot, rand_column_family, column_family->GetName(), - keystr, status_at, value_at, - key_vec}; + ThreadState::SnapshotState snap_state = {snapshot, + rand_column_family, + column_family->GetName(), + keystr, + status_at, + value_at, + key_vec, + ts_str}; uint64_t hold_for = FLAGS_snapshot_hold_ops; if (FLAGS_long_running_snapshots) { // Hold 10% of snapshots for 10x more @@ -1826,6 +1995,13 @@ uint32_t StressTest::GetRangeHash(ThreadState* thread, const Snapshot* snapshot, ReadOptions ro; ro.snapshot = snapshot; ro.total_order_seek = true; + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + ro.timestamp = &ts; + } std::unique_ptr it(db_->NewIterator(ro, column_family)); for (it->Seek(start_key); it->Valid() && options_.comparator->Compare(it->key(), end_key) <= 0; @@ -1852,7 +2028,7 @@ void StressTest::PrintEnv() const { fprintf(stdout, "TransactionDB : %s\n", FLAGS_use_txn ? "true" : "false"); #ifndef ROCKSDB_LITE - fprintf(stdout, "BlobDB : %s\n", + fprintf(stdout, "Stacked BlobDB : %s\n", FLAGS_use_blob_db ? 
"true" : "false"); #endif // !ROCKSDB_LITE fprintf(stdout, "Read only mode : %s\n", @@ -1927,13 +2103,16 @@ void StressTest::PrintEnv() const { fprintf(stdout, "Memtablerep : %s\n", memtablerep); - fprintf(stdout, "Test kill odd : %d\n", rocksdb_kill_odds); - if (!rocksdb_kill_exclude_prefixes.empty()) { +#ifndef NDEBUG + KillPoint* kp = KillPoint::GetInstance(); + fprintf(stdout, "Test kill odd : %d\n", kp->rocksdb_kill_odds); + if (!kp->rocksdb_kill_exclude_prefixes.empty()) { fprintf(stdout, "Skipping kill points prefixes:\n"); - for (auto& p : rocksdb_kill_exclude_prefixes) { + for (auto& p : kp->rocksdb_kill_exclude_prefixes) { fprintf(stdout, " %s\n", p.c_str()); } } +#endif fprintf(stdout, "Periodic Compaction Secs : %" PRIu64 "\n", FLAGS_periodic_compaction_seconds); fprintf(stdout, "Compaction TTL : %" PRIu64 "\n", @@ -1947,9 +2126,17 @@ void StressTest::PrintEnv() const { fprintf(stdout, "Use dynamic level : %d\n", static_cast(FLAGS_level_compaction_dynamic_level_bytes)); fprintf(stdout, "Read fault one in : %d\n", FLAGS_read_fault_one_in); + fprintf(stdout, "Write fault one in : %d\n", FLAGS_write_fault_one_in); + fprintf(stdout, "Open metadata write fault one in:\n"); + fprintf(stdout, " %d\n", + FLAGS_open_metadata_write_fault_one_in); fprintf(stdout, "Sync fault injection : %d\n", FLAGS_sync_fault_injection); fprintf(stdout, "Best efforts recovery : %d\n", static_cast(FLAGS_best_efforts_recovery)); + fprintf(stdout, "Fail if OPTIONS file error: %d\n", + static_cast(FLAGS_fail_if_options_file_error)); + fprintf(stdout, "User timestamp size bytes : %d\n", + static_cast(FLAGS_user_timestamp_size)); fprintf(stdout, "------------------------------------------------\n"); } @@ -1964,6 +2151,12 @@ void StressTest::Open() { block_based_options.block_cache = cache_; block_based_options.cache_index_and_filter_blocks = FLAGS_cache_index_and_filter_blocks; + block_based_options.metadata_cache_options.top_level_index_pinning = + static_cast(FLAGS_top_level_index_pinning); + block_based_options.metadata_cache_options.partition_pinning = + static_cast(FLAGS_partition_pinning); + block_based_options.metadata_cache_options.unpartitioned_pinning = + static_cast(FLAGS_unpartitioned_pinning); block_based_options.block_cache_compressed = compressed_cache_; block_based_options.checksum = checksum_type_e; block_based_options.block_size = FLAGS_block_size; @@ -2028,6 +2221,8 @@ void StressTest::Open() { FLAGS_compression_zstd_max_train_bytes; options_.compression_opts.parallel_threads = FLAGS_compression_parallel_threads; + options_.compression_opts.max_dict_buffer_bytes = + FLAGS_compression_max_dict_buffer_bytes; options_.create_if_missing = true; options_.max_manifest_file_size = FLAGS_max_manifest_file_size; options_.inplace_update_support = FLAGS_in_place_update; @@ -2058,6 +2253,18 @@ void StressTest::Open() { FLAGS_level_compaction_dynamic_level_bytes; options_.file_checksum_gen_factory = GetFileChecksumImpl(FLAGS_file_checksum_impl); + options_.track_and_verify_wals_in_manifest = true; + + // Integrated BlobDB + options_.enable_blob_files = FLAGS_enable_blob_files; + options_.min_blob_size = FLAGS_min_blob_size; + options_.blob_file_size = FLAGS_blob_file_size; + options_.blob_compression_type = + StringToCompressionType(FLAGS_blob_compression_type.c_str()); + options_.enable_blob_garbage_collection = + FLAGS_enable_blob_garbage_collection; + options_.blob_garbage_collection_age_cutoff = + FLAGS_blob_garbage_collection_age_cutoff; } else { #ifdef ROCKSDB_LITE fprintf(stderr, "--options_file 
not supported in lite mode\n"); @@ -2146,10 +2353,38 @@ void StressTest::Open() { options_.best_efforts_recovery = FLAGS_best_efforts_recovery; options_.paranoid_file_checks = FLAGS_paranoid_file_checks; + options_.fail_if_options_file_error = FLAGS_fail_if_options_file_error; + + if ((options_.enable_blob_files || options_.enable_blob_garbage_collection || + FLAGS_allow_setting_blob_options_dynamically) && + (FLAGS_use_merge || FLAGS_best_efforts_recovery)) { + fprintf(stderr, + "Integrated BlobDB is currently incompatible with Merge, " + "and best-effort recovery\n"); + exit(1); + } + + if (options_.enable_blob_files) { + fprintf(stdout, + "Integrated BlobDB: blob files enabled, min blob size %" PRIu64 + ", blob file size %" PRIu64 ", blob compression type %s\n", + options_.min_blob_size, options_.blob_file_size, + CompressionTypeToString(options_.blob_compression_type).c_str()); + } + + if (options_.enable_blob_garbage_collection) { + fprintf(stdout, "Integrated BlobDB: blob GC enabled, cutoff %f\n", + options_.blob_garbage_collection_age_cutoff); + } fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); Status s; + + if (FLAGS_user_timestamp_size > 0) { + CheckAndSetOptionsForUserTimestamp(); + } + if (FLAGS_ttl == -1) { std::vector existing_column_families; s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db, @@ -2203,32 +2438,88 @@ void StressTest::Open() { new DbStressListener(FLAGS_db, options_.db_paths, cf_descriptors)); options_.create_missing_column_families = true; if (!FLAGS_use_txn) { +#ifndef NDEBUG + // Determine whether we need to ingest file metadata write failures + // during DB reopen. If it does, enable it. + // Only ingest metadata error if it is reopening, as initial open + // failure doesn't need to be handled. + // TODO cover transaction DB is not covered in this fault test too. 
+ bool ingest_meta_error = + FLAGS_open_metadata_write_fault_one_in && + fault_fs_guard + ->FileExists(FLAGS_db + "/CURRENT", IOOptions(), nullptr) + .ok(); + if (ingest_meta_error) { + fault_fs_guard->EnableMetadataWriteErrorInjection(); + fault_fs_guard->SetRandomMetadataWriteError( + FLAGS_open_metadata_write_fault_one_in); + } + while (true) { +#endif // NDEBUG #ifndef ROCKSDB_LITE - if (FLAGS_use_blob_db) { - blob_db::BlobDBOptions blob_db_options; - blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; - blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; - blob_db_options.blob_file_size = FLAGS_blob_db_file_size; - blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; - blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; - - blob_db::BlobDB* blob_db = nullptr; - s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, - cf_descriptors, &column_families_, &blob_db); - if (s.ok()) { - db_ = blob_db; - } - } else + // StackableDB-based BlobDB + if (FLAGS_use_blob_db) { + blob_db::BlobDBOptions blob_db_options; + blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; + blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; + blob_db_options.blob_file_size = FLAGS_blob_db_file_size; + blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; + blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; + + blob_db::BlobDB* blob_db = nullptr; + s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, + cf_descriptors, &column_families_, + &blob_db); + if (s.ok()) { + db_ = blob_db; + } + } else #endif // !ROCKSDB_LITE - { - if (db_preload_finished_.load() && FLAGS_read_only) { - s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); - } else { - s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); + { + if (db_preload_finished_.load() && FLAGS_read_only) { + s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, + cf_descriptors, &column_families_, &db_); + } else { + s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, + &column_families_, &db_); + } + } + +#ifndef NDEBUG + if (ingest_meta_error) { + fault_fs_guard->DisableMetadataWriteErrorInjection(); + if (s.ok()) { + // Ingested errors might happen in background compactions. We + // wait for all compactions to finish to make sure DB is in + // clean state before executing queries. + s = static_cast_with_check(db_->GetRootDB()) + ->TEST_WaitForCompact(true); + if (!s.ok()) { + delete db_; + } + } + if (!s.ok()) { + // After failure to opening a DB due to IO error, retry should + // successfully open the DB with correct data if no IO error shows + // up. 
+ ingest_meta_error = false; + + Random rand(static_cast(FLAGS_seed)); + if (rand.OneIn(2)) { + fault_fs_guard->DeleteFilesCreatedAfterLastDirSync(IOOptions(), + nullptr); + } + if (rand.OneIn(3)) { + fault_fs_guard->DropUnsyncedFileData(); + } else if (rand.OneIn(2)) { + fault_fs_guard->DropRandomUnsyncedFileData(&rand); + } + continue; + } } + break; } +#endif // NDEBUG } else { #ifndef ROCKSDB_LITE TransactionDBOptions txn_db_options; @@ -2395,11 +2686,77 @@ void StressTest::Reopen(ThreadState* thread) { secondaries_.clear(); num_times_reopened_++; - auto now = db_stress_env->NowMicros(); + auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database for the %dth time\n", - db_stress_env->TimeToString(now / 1000000).c_str(), - num_times_reopened_); + clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_); Open(); } + +void StressTest::CheckAndSetOptionsForUserTimestamp() { + assert(FLAGS_user_timestamp_size > 0); + const Comparator* const cmp = test::ComparatorWithU64Ts(); + assert(cmp); + if (FLAGS_user_timestamp_size != cmp->timestamp_size()) { + fprintf(stderr, + "Only -user_timestamp_size=%d is supported in stress test.\n", + static_cast(cmp->timestamp_size())); + exit(1); + } + if (FLAGS_nooverwritepercent > 0) { + fprintf(stderr, + "-nooverwritepercent must be 0 because SingleDelete must be " + "disabled.\n"); + exit(1); + } + if (FLAGS_use_merge || FLAGS_use_full_merge_v1) { + fprintf(stderr, "Merge does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_delrangepercent > 0) { + fprintf(stderr, "DeleteRange does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_use_txn) { + fprintf(stderr, "TransactionDB does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_read_only) { + fprintf(stderr, "When opened as read-only, timestamp not supported.\n"); + exit(1); + } + if (FLAGS_test_secondary || FLAGS_secondary_catch_up_one_in > 0 || + FLAGS_continuous_verification_interval > 0) { + fprintf(stderr, "Secondary instance does not support timestamp.\n"); + exit(1); + } + if (FLAGS_checkpoint_one_in > 0) { + fprintf(stderr, + "-checkpoint_one_in=%d requires " + "DBImplReadOnly, which is not supported with timestamp\n", + FLAGS_checkpoint_one_in); + exit(1); + } +#ifndef ROCKSDB_LITE + if (FLAGS_enable_blob_files || FLAGS_use_blob_db) { + fprintf(stderr, "BlobDB not supported with timestamp.\n"); + exit(1); + } +#endif // !ROCKSDB_LITE + if (FLAGS_enable_compaction_filter) { + fprintf(stderr, "CompactionFilter not supported with timestamp.\n"); + exit(1); + } + if (FLAGS_test_cf_consistency || FLAGS_test_batches_snapshots) { + fprintf(stderr, + "Due to per-key ts-seq ordering constraint, only the (default) " + "non-batched test is supported with timestamp.\n"); + exit(1); + } + if (FLAGS_ingest_external_file_one_in > 0) { + fprintf(stderr, "Bulk loading may not support timestamp yet.\n"); + exit(1); + } + options_.comparator = cmp; +} } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index 426af3bd082..351dc0137b4 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -13,6 +13,7 @@ #include "db_stress_tool/db_stress_shared_state.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; class Transaction; class TransactionDB; @@ -24,6 +25,8 @@ class StressTest { std::shared_ptr NewCache(size_t capacity); + static std::vector GetBlobCompressionTags(); + bool BuildOptionsTable(); void InitDb(); @@ -208,6 +211,8 @@ class 
StressTest { void Reopen(ThreadState* thread); + void CheckAndSetOptionsForUserTimestamp(); + std::shared_ptr cache_; std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; @@ -216,6 +221,7 @@ class StressTest { TransactionDB* txn_db_; #endif Options options_; + SystemClock* clock_; std::vector column_families_; std::vector column_family_names_; std::atomic new_column_family_name_; diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index 2126a143634..cb489eb1340 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -23,6 +23,7 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_driver.h" +#include "rocksdb/convenience.h" #ifndef NDEBUG #include "utilities/fault_injection_fs.h" #endif @@ -34,11 +35,6 @@ static std::shared_ptr env_wrapper_guard; static std::shared_ptr fault_env_guard; } // namespace -static Env* GetCompositeEnv(std::shared_ptr fs) { - static std::shared_ptr composite_env = NewCompositeEnv(fs); - return composite_env.get(); -} - KeyGenContext key_gen_ctx; int db_stress_tool(int argc, char** argv) { @@ -78,39 +74,53 @@ int db_stress_tool(int argc, char** argv) { if (!FLAGS_hdfs.empty()) { raw_env = new ROCKSDB_NAMESPACE::HdfsEnv(FLAGS_hdfs); - } else if (!FLAGS_env_uri.empty()) { - Status s = Env::LoadEnv(FLAGS_env_uri, &raw_env, &env_guard); - if (raw_env == nullptr) { - fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str()); - exit(1); - } - } else if (!FLAGS_fs_uri.empty()) { - std::shared_ptr fs; - Status s = FileSystem::Load(FLAGS_fs_uri, &fs); + } else { + Status s = Env::CreateFromUri(ConfigOptions(), FLAGS_env_uri, FLAGS_fs_uri, + &raw_env, &env_guard); if (!s.ok()) { - fprintf(stderr, "Error: %s\n", s.ToString().c_str()); + fprintf(stderr, "Error Creating Env URI: %s: %s\n", FLAGS_env_uri.c_str(), + s.ToString().c_str()); exit(1); } - raw_env = GetCompositeEnv(fs); - } else { - raw_env = Env::Default(); } #ifndef NDEBUG - if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection) { + if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection || + FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in) { FaultInjectionTestFS* fs = new FaultInjectionTestFS(raw_env->GetFileSystem()); fault_fs_guard.reset(fs); - fault_fs_guard->SetFilesystemDirectWritable(true); + if (FLAGS_write_fault_one_in) { + fault_fs_guard->SetFilesystemDirectWritable(false); + } else { + fault_fs_guard->SetFilesystemDirectWritable(true); + } fault_env_guard = std::make_shared(raw_env, fault_fs_guard); raw_env = fault_env_guard.get(); } + if (FLAGS_write_fault_one_in) { + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_guard->EnableWriteErrorInjection(); }); + SyncPoint::GetInstance()->EnableProcessing(); + } #endif env_wrapper_guard = std::make_shared(raw_env); db_stress_env = env_wrapper_guard.get(); +#ifndef NDEBUG + if (FLAGS_write_fault_one_in) { + // In the write injection case, we need to use the FS interface and returns + // the IOStatus with different error and flags. Therefore, + // DbStressEnvWrapper cannot be used which will swallow the FS + // implementations. We should directly use the raw_env which is the + // CompositeEnvWrapper of env and fault_fs. 
+ db_stress_env = raw_env; + } +#endif + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); // The number of background threads should be at least as much the @@ -131,17 +141,22 @@ int db_stress_tool(int argc, char** argv) { "test_batches_snapshots test!\n"); exit(1); } - if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0) { + if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0 && + !FLAGS_memtable_whole_key_filtering) { fprintf(stderr, - "Error: please specify positive prefix_size in order to use " - "memtable_prefix_bloom_size_ratio\n"); + "Error: please specify positive prefix_size or enable whole key " + "filtering in order to use memtable_prefix_bloom_size_ratio\n"); exit(1); } if ((FLAGS_readpercent + FLAGS_prefixpercent + FLAGS_writepercent + FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent) != 100) { fprintf(stderr, - "Error: Read+Prefix+Write+Delete+DeleteRange+Iterate percents != " - "100!\n"); + "Error: " + "Read(%d)+Prefix(%d)+Write(%d)+Delete(%d)+DeleteRange(%d)" + "+Iterate(%d) percents != " + "100!\n", + FLAGS_readpercent, FLAGS_prefixpercent, FLAGS_writepercent, + FLAGS_delpercent, FLAGS_delrangepercent, FLAGS_iterpercent); exit(1); } if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) { @@ -264,9 +279,19 @@ int db_stress_tool(int argc, char** argv) { "test_batches_snapshots must all be 0 when using compaction filter\n"); exit(1); } + if (FLAGS_batch_protection_bytes_per_key > 0 && + !FLAGS_test_batches_snapshots) { + fprintf(stderr, + "Error: test_batches_snapshots must be enabled when " + "batch_protection_bytes_per_key > 0\n"); + exit(1); + } - rocksdb_kill_odds = FLAGS_kill_random_test; - rocksdb_kill_exclude_prefixes = SplitString(FLAGS_kill_exclude_prefixes); +#ifndef NDEBUG + KillPoint* kp = KillPoint::GetInstance(); + kp->rocksdb_kill_odds = FLAGS_kill_random_test; + kp->rocksdb_kill_exclude_prefixes = SplitString(FLAGS_kill_exclude_prefixes); +#endif unsigned int levels = FLAGS_max_key_len; std::vector weights; diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 269d0886d3f..a4ca23d87a0 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -22,6 +22,13 @@ class NonBatchedOpsStressTest : public StressTest { void VerifyDb(ThreadState* thread) const override { ReadOptions options(FLAGS_verify_checksum, true); + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + options.timestamp = &ts; + } auto shared = thread->shared; const int64_t max_key = shared->GetMaxKey(); const int64_t keys_per_thread = max_key / shared->GetNumThreads(); @@ -477,6 +484,8 @@ class NonBatchedOpsStressTest : public StressTest { int64_t max_key = shared->GetMaxKey(); int64_t rand_key = rand_keys[0]; int rand_column_family = rand_column_families[0]; + std::string write_ts_str; + Slice write_ts; while (!shared->AllowsOverwrite(rand_key) && (FLAGS_use_merge || shared->Exists(rand_column_family, rand_key))) { lock.reset(); @@ -484,6 +493,11 @@ class NonBatchedOpsStressTest : public StressTest { rand_column_family = thread->rand.Next() % FLAGS_column_families; lock.reset( new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key))); + if (FLAGS_user_timestamp_size > 0) { + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } } std::string key_str = Key(rand_key); @@ -559,6 +573,8 @@ class NonBatchedOpsStressTest : 
public StressTest { // OPERATION delete // If the chosen key does not allow overwrite and it does not exist, // choose another key. + std::string write_ts_str; + Slice write_ts; while (!shared->AllowsOverwrite(rand_key) && !shared->Exists(rand_column_family, rand_key)) { lock.reset(); @@ -566,6 +582,11 @@ class NonBatchedOpsStressTest : public StressTest { rand_column_family = thread->rand.Next() % FLAGS_column_families; lock.reset( new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key))); + if (FLAGS_user_timestamp_size > 0) { + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } } std::string key_str = Key(rand_key); diff --git a/docs/Gemfile b/docs/Gemfile index de6f39519db..d78e4354d29 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -1,2 +1,4 @@ source 'https://rubygems.org' -gem 'github-pages', '~> 207' +gem 'github-pages', '~> 209' + +gem "webrick", "~> 1.7" diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 7702aece13b..5a366b7d448 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -1,7 +1,7 @@ GEM remote: https://rubygems.org/ specs: - activesupport (6.0.3.2) + activesupport (6.0.3.4) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 0.7, < 2) minitest (~> 5.1) @@ -17,37 +17,40 @@ GEM commonmarker (0.17.13) ruby-enum (~> 0.5) concurrent-ruby (1.1.7) - dnsruby (1.61.4) + dnsruby (1.61.5) simpleidn (~> 0.1) - em-websocket (0.5.1) + em-websocket (0.5.2) eventmachine (>= 0.12.9) http_parser.rb (~> 0.6.0) ethon (0.12.0) ffi (>= 1.3.0) eventmachine (1.2.7) execjs (2.7.0) - faraday (1.0.1) + faraday (1.3.0) + faraday-net_http (~> 1.0) multipart-post (>= 1.2, < 3) - ffi (1.13.1) + ruby2_keywords + faraday-net_http (1.0.0) + ffi (1.14.2) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (207) + github-pages (209) github-pages-health-check (= 1.16.1) jekyll (= 3.9.0) jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) jekyll-commonmark-ghpages (= 0.1.6) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.13.0) + jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) jekyll-github-metadata (= 2.13.0) - jekyll-mentions (= 1.5.1) + jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.15.0) + jekyll-redirect-from (= 0.16.0) jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.1) + jekyll-remote-theme (= 0.4.2) jekyll-sass-converter (= 1.5.2) jekyll-seo-tag (= 2.6.1) jekyll-sitemap (= 1.4.0) @@ -55,7 +58,7 @@ GEM jekyll-theme-architect (= 0.1.1) jekyll-theme-cayman (= 0.1.1) jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) + jekyll-theme-hacker (= 0.1.2) jekyll-theme-leap-day (= 0.1.1) jekyll-theme-merlot (= 0.1.1) jekyll-theme-midnight (= 0.1.1) @@ -66,14 +69,14 @@ GEM jekyll-theme-tactile (= 0.1.1) jekyll-theme-time-machine (= 0.1.1) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.11.1) - kramdown (= 2.3.0) + jemoji (= 0.12.0) + kramdown (= 2.3.1) kramdown-parser-gfm (= 1.1.0) liquid (= 4.0.3) mercenary (~> 0.3) minima (= 2.5.1) nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.19.0) + rouge (= 3.23.0) terminal-table (~> 1.4) github-pages-health-check (1.16.1) addressable (~> 2.3) @@ -81,7 +84,7 @@ GEM octokit (~> 4.0) public_suffix (~> 3.0) typhoeus (~> 1.3) - html-pipeline (2.13.0) + html-pipeline (2.14.0) activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.6.0) @@ -114,14 +117,14 @@ GEM rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.13.0) + jekyll-feed (0.15.1) jekyll (>= 3.7, 
< 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) jekyll-github-metadata (2.13.0) jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.5.1) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) jekyll (>= 3.7, < 5.0) jekyll-optional-front-matter (0.3.2) @@ -129,14 +132,15 @@ GEM jekyll-paginate (1.1.0) jekyll-readme-index (0.3.0) jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.15.0) + jekyll-redirect-from (0.16.0) jekyll (>= 3.3, < 5.0) jekyll-relative-links (0.6.1) jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.1) + jekyll-remote-theme (0.4.2) addressable (~> 2.0) jekyll (>= 3.5, < 5.0) - rubyzip (>= 1.3.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) jekyll-seo-tag (2.6.1) @@ -153,8 +157,8 @@ GEM jekyll-theme-dinky (0.1.1) jekyll (~> 3.5) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.1.2) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) jekyll-theme-leap-day (0.1.1) jekyll (~> 3.5) @@ -188,41 +192,44 @@ GEM jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.11.1) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (2.3.0) + kramdown (2.3.1) rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) liquid (4.0.3) - listen (3.2.1) + listen (3.4.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.4.0) + mini_portile2 (2.5.1) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.14.1) + minitest (5.14.3) multipart-post (2.1.1) - nokogiri (1.10.10) - mini_portile2 (~> 2.4.0) - octokit (4.18.0) + nokogiri (1.11.4) + mini_portile2 (~> 2.5.0) + racc (~> 1.4) + octokit (4.20.0) faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (3.1.1) + racc (1.5.2) rb-fsevent (0.10.4) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.4) - rouge (3.19.0) + rexml (3.2.5) + rouge (3.23.0) ruby-enum (0.8.0) i18n + ruby2_keywords (0.0.2) rubyzip (2.3.0) safe_yaml (1.0.5) sass (3.7.4) @@ -240,19 +247,21 @@ GEM thread_safe (0.3.6) typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.7) + tzinfo (1.2.9) thread_safe (~> 0.1) unf (0.1.4) unf_ext unf_ext (0.0.7.7) unicode-display_width (1.7.0) - zeitwerk (2.4.0) + webrick (1.7.0) + zeitwerk (2.4.2) PLATFORMS ruby DEPENDENCIES - github-pages (~> 207) + github-pages (~> 209) + webrick (~> 1.7) BUNDLED WITH - 2.1.4 + 2.2.3 diff --git a/docs/_data/authors.yml b/docs/_data/authors.yml index 13225be9dfd..dca958ea244 100644 --- a/docs/_data/authors.yml +++ b/docs/_data/authors.yml @@ -68,3 +68,6 @@ lightmark: fgwu: full_name: Fenggang Wu fbid: 100002297362180 + +ltamasi: + full_name: Levi Tamasi diff --git a/docs/_posts/2021-04-12-universal-improvements.markdown b/docs/_posts/2021-04-12-universal-improvements.markdown new file mode 100644 index 00000000000..fa4e9d463b2 --- /dev/null +++ b/docs/_posts/2021-04-12-universal-improvements.markdown @@ -0,0 +1,46 @@ +--- +title: (Call For Contribution) Make Universal Compaction More Incremental +layout: post +author: sdong +category: blog +--- + +### Motivation + +Universal Compaction is an important compaction style, but few changes were made after we made the structure multi-leveled. Yet the major restriction of always compacting full sorted run is not relaxed. 
Compared to Leveled Compaction, where we usually compact only a few SST files together, in universal compaction we frequently compact GBs of data. This gap causes two issues: 1. it makes it harder to unify universal and leveled compaction; 2. data is periodically fully compacted, and in the meantime space usage is temporarily doubled. To ease the problem, we can relax this restriction and compact more incrementally, as leveled compaction does, bringing universal compaction closer to a unified compaction style. + +We are calling for help with the following improvements. + + +### How Universal Compaction Works + +In universal compaction, whole levels are compacted together to satisfy two conditions (see the [wiki page](https://github.com/facebook/rocksdb/wiki/Universal-Compaction) for more details): + +1. total size / bottommost level size > a threshold, or +2. total number of sorted runs (non-0 levels + L0 files) is within a threshold + +Condition 1 limits the extra space overhead used by dead data, and condition 2 is for read performance. + +If condition 1 is triggered, a full compaction is likely triggered. If condition 2 is triggered, RocksDB compacts some sorted runs to bring the number down. It does so with a simple heuristic meant to reduce the writes needed for this purpose over time: it starts by compacting smaller files, but if the total size to compact is similar to or larger than the size of the next level, it takes that level too, and so on (whether this is the best heuristic is another question, and we have never seriously looked at it). + +### How We Can Improve? + +Let's start with condition 1. Today we do a full compaction here, but that is not necessary. A simple optimization would be to compact just enough files into the bottommost level (Lmax) to satisfy condition 1. This would work if we only need to pick some files from Lmax-1; if it is cheaper over time, we can pick some files from other levels too (a toy sketch of this idea appears at the end of this post). + +Then condition 2. After we address condition 1, there might be holes in some key ranges in older levels. These holes might make it possible to fix the LSM-tree for condition 2 by compacting only some sub-ranges. RocksDB can take individual files into consideration and apply a more sophisticated heuristic. + +This new approach makes universal compaction closer to leveled compaction. The operation for condition 1 is close to how leveled compaction triggers an Lmax-1 to Lmax compaction, and condition 2 can potentially be implemented as something similar to level picking in leveled compaction. In fact, all of these file-picking strategies can coexist in a single compaction style; there is no fundamental conflict preventing that. + +### Limitations + +There are a few limitations: + +* Periodic automatic full compaction is unpleasant in one way but useful in another. Some users may rely on it to reason that everything is periodically collapsed, so dead data is gone and old data is rewritten. We need to make sure periodic compaction continues to work for them. +* The L0 to first non-L0 level compaction is the first time data is partitioned in the LSM-tree, which is what makes incremental compaction by range possible. We might need to do more of these compactions to make incremental compaction possible, which will slightly increase the total amount of compaction. +* Compacting a subset of a level introduces some extra overhead for unaligned files, just as in leveled compaction. Better SST boundary-cutting heuristics can reduce this overhead, but it will still be there. + +But I believe the benefits would outweigh the limitations. Reducing the temporary space doubling and moving towards unified compaction would be important achievements. 
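+To make the first idea in "How We Can Improve?" concrete, here is a rough, hypothetical sketch of picking just enough Lmax-1 files to satisfy condition 1. It is not actual RocksDB code; the types and the function name are illustrative only, and it conservatively assumes the picked bytes simply move into Lmax.
+
+```cpp
+#include <cstdint>
+#include <vector>
+
+// Toy model of an SST file; only its size matters here.
+struct FileMeta {
+  uint64_t size;
+};
+
+// Pick files from Lmax-1 until total_size / lmax_size drops to or below the
+// space-amplification threshold, instead of compacting every sorted run.
+std::vector<const FileMeta*> PickJustEnoughForSpaceAmp(
+    const std::vector<FileMeta>& lmax_minus_1, uint64_t total_size,
+    uint64_t lmax_size, double space_amp_threshold) {
+  std::vector<const FileMeta*> picked;
+  for (const FileMeta& f : lmax_minus_1) {
+    if (static_cast<double>(total_size) <=
+        space_amp_threshold * static_cast<double>(lmax_size)) {
+      break;  // condition 1 satisfied; no need to pull in more files
+    }
+    picked.push_back(&f);
+    // Assume the file's bytes end up in Lmax; any garbage dropped during the
+    // compaction would only improve the ratio further.
+    lmax_size += f.size;
+  }
+  return picked;
+}
+```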
+ +### Interested in Help? + +Compaction is the core of LSM-tree, but its improvements are far overdue. If you are a user of universal compaction and would be able to benefit from those improvements, we will be happy to work with you on speeding up the project and bring them to RocksDB sooner. Feel free to communicate with us in [this issue](https://github.com/facebook/rocksdb/issues/8181). diff --git a/docs/_posts/2021-05-26-integrated-blob-db.markdown b/docs/_posts/2021-05-26-integrated-blob-db.markdown new file mode 100644 index 00000000000..9f3a22fa275 --- /dev/null +++ b/docs/_posts/2021-05-26-integrated-blob-db.markdown @@ -0,0 +1,101 @@ +--- +title: Integrated BlobDB +layout: post +author: ltamasi +category: blog +--- +## Background + +BlobDB is essentially RocksDB for large-value use cases. The basic idea, which was proposed in the [WiscKey paper](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf), is key-value separation: by storing large values in dedicated blob files and storing only small pointers to them in the LSM tree, we avoid copying the values over and over again during compaction, thus reducing write amplification. Historically, BlobDB supported only FIFO and TTL based use cases that can tolerate some data loss. In addition, it was incompatible with many widely used RocksDB features, and required users to adopt a custom API. In 2020, we decided to rearchitect BlobDB from the ground up, taking the lessons learned from WiscKey and the original BlobDB but also drawing inspiration and incorporating ideas from other similar systems. Our goals were to eliminate the above limitations and to create a new integrated version that enables customers to use the well-known RocksDB API, has feature parity with the core of RocksDB, and offers better performance. This new implementation is now available and provides the following improvements over the original: + +* **API.** In contrast with the legacy BlobDB implementation, which had its own `StackableDB`-based interface (`rocksdb::blob_db::BlobDB`), the new version can be used via the well-known `rocksdb::DB` API, and can be configured simply by using a few column family options. +* **Consistency.** With the integrated BlobDB implementation, RocksDB’s consistency guarantees and various write options (like using the WAL or synchronous writes) now apply to blobs as well. Moreover, the new BlobDB keeps track of blob files in the RocksDB MANIFEST. +* **Write performance.** When using the old BlobDB, blobs are extracted and immediately written to blob files by the BlobDB layer *in the application thread*. This has multiple drawbacks from a performance perspective: first, it requires synchronization; second, it means that expensive operations like compression are performed in the application thread; and finally, it involves flushing the blob file after each blob. The new code takes a completely different approach by *offloading blob file building to RocksDB’s background jobs*, i.e. flushes and compactions. This means that similarly to SSTs, any given blob file is now written by a single background thread, eliminating the need for locking, flushing, or performing compression in the foreground. Note that this approach is also a better fit for network-based file systems where small writes might be expensive and opens up the possibility of file format optimizations that involve buffering (like dictionary compression). +* **Read performance.** The old code relies on each read (i.e. 
`Get`, `MultiGet`, or iterator) taking a snapshot and uses those snapshots when deciding which obsolete blob files can be removed. The new BlobDB improves this by generalizing RocksDB’s Version concept, which historically referred to the set of live SST files at a given point in time, to include the set of live blob files as well. This has performance benefits like [making the read path mostly lock-free by utilizing thread-local storage](https://rocksdb.org/blog/2014/06/27/avoid-expensive-locks-in-get.html). We have also introduced a blob file cache that can be utilized to keep frequently accessed blob files open. +* **Garbage collection.** Key-value separation means that if a key pointing to a blob gets overwritten or deleted, the blob becomes unreferenced garbage. To be able to reclaim this space, BlobDB now has garbage collection capabilities. GC is integrated into the compaction process and works by relocating valid blobs residing in old blob files as they are encountered during compaction. Blob files can be marked obsolete (and eventually deleted in one shot) once they contain nothing but garbage. This is more efficient than the method used by WiscKey, which involves performing a `Get` operation to find out whether a blob is still referenced followed by a `Put` to update the reference, which in turn results in garbage collection competing and potentially conflicting with the application’s writes. +* **Feature parity with the RocksDB core.** The new BlobDB supports way more features than the original and is near feature parity with vanilla RocksDB. In particular, we support all basic read/write APIs (with the exception of `Merge`, which is coming soon), recovery, compression, atomic flush, column families, compaction filters, checkpoints, backup/restore, transactions, per-file checksums, and the SST file manager. In addition, the new BlobDB’s options can be dynamically adjusted using the `SetOptions` interface. + +## API + +The new BlobDB can be configured (on a per-column family basis if needed) simply by using the following options: + +* `enable_blob_files`: set it to `true` to enable key-value separation. +* `min_blob_size`: values at or above this threshold will be written to blob files during flush or compaction. +* `blob_file_size`: the size limit for blob files. +* `blob_compression_type`: the compression type to use for blob files. All blobs in the same file are compressed using the same algorithm. +* `enable_blob_garbage_collection`: set this to `true` to make BlobDB actively relocate valid blobs from the oldest blob files as they are encountered during compaction. +* `blob_garbage_collection_age_cutoff`: the threshold that the GC logic uses to determine which blob files should be considered “old.” For example, the default value of 0.25 signals to RocksDB that blobs residing in the oldest 25% of blob files should be relocated by GC. This parameter can be tuned to adjust the trade-off between write amplification and space amplification. + +The above options are all dynamically adjustable via the `SetOptions` API; changing them will affect subsequent flushes and compactions but not ones that are already in progress. + +In terms of compaction styles, we recommend using leveled compaction with BlobDB. The rationale behind universal compaction in general is to provide lower write amplification at the expense of higher read amplification; however, as we will see later in the Performance section, BlobDB can provide very low write amp and good read performance with leveled compaction. 
Therefore, there is really no reason to take the hit in read performance that comes with universal compaction. + +In addition to the above, consider tuning the following non-BlobDB specific options: + +* `write_buffer_size`: this is the memtable size. You might want to increase it for large-value workloads to ensure that SST and blob files contain a decent number of keys. +* `target_file_size_base`: the target size of SST files. Note that even when using BlobDB, it is important to have an LSM tree with a “nice” shape and multiple levels and files per level to prevent heavy compactions. Since BlobDB extracts and writes large values to blob files, it makes sense to make this parameter significantly smaller than the memtable size. One guideline is to set `blob_file_size` to the same value as `write_buffer_size` (adjusted for compression if needed) and make `target_file_size_base` proportionally smaller based on the ratio of key size to value size. +* `max_bytes_for_level_base`: consider setting this to a multiple (e.g. 8x or 10x) of `target_file_size_base`. + +As mentioned above, the new BlobDB now also supports compaction filters. Key-value separation actually enables an optimization here: if the compaction filter of an application can make a decision about a key-value solely based on the key, it is unnecessary to read the value from the blob file. Applications can take advantage of this optimization by implementing the new `FilterBlobByKey` method of the `CompactionFilter` interface. This method gets called by RocksDB first whenever it encounters a key-value where the value is stored in a blob file. If this method returns a “final” decision like `kKeep`, `kRemove`, `kChangeValue`, or `kRemoveAndSkipUntil`, RocksDB will honor that decision; on the other hand, if the method returns `kUndetermined`, RocksDB will read the blob from the blob file and call `FilterV2` with the value in the usual fashion. + +## Performance + +We tested the performance of the new BlobDB for six different value sizes between 1 KB and 1 MB using a customized version of our [standard benchmark suite](https://github.com/facebook/rocksdb/wiki/Performance-Benchmarks) on a box with an 18-core Skylake DE CPU (running at 1.6 GHz, with hyperthreading enabled), 64 GB RAM, a 512 GB boot SSD, and two 1.88 TB M.2 SSDs in a RAID0 configuration for data. The RocksDB version used was equivalent to 6.18.1, with some benchmarking and statistics related enhancements. Leveled and universal compaction without key-value separation were used as reference points. Note that for simplicity, we use “leveled compaction” and “universal compaction” as shorthand for leveled and universal compaction without key-value separation, respectively, and “BlobDB” for BlobDB with leveled compaction. + +Our benchmarks cycled through six different workloads: two write-only ones (initial load and overwrite), two read/write ones (point lookup/write mix and range scan/write mix), and finally two read-only ones (point lookups and range scans). The first two phases performed a fixed amount of work (see below), while the final four were run for a fixed amount of time, namely 30 minutes each. Each phase other than the first one started with the database state left behind by the previous one. 
Here’s a brief description of the workloads: + +* **Initial load**: this workload has two distinct stages, a single-threaded random write stage during which compactions are disabled (so all data is flushed to L0, where it remains for the rest of the stage), followed by a full manual compaction. The random writes are performed with load-optimized settings, namely using the vector memtable implementation and with concurrent memtable writes and WAL disabled. This stage was used to populate the database with 1 TB worth of raw values, e.g. 2^30 (~1 billion) 1 KB values or 2^20 (~1 million) 1 MB values. +* **Overwrite**: this is a multi-threaded random write workload using the usual skiplist memtable, with compactions, WAL, and concurrent memtable writes enabled. In our tests, 16 writer threads were used. The total number of writes was set to the same number as in the initial load stage and split up evenly between the writer threads. For instance, for the 1 MB value size, we had 2^20 writes divided up between the 16 threads, resulting in each thread performing 2^16 write operations. At the end of this phase, a “wait for compactions” step was added to prevent this workload from exhibiting artificially low write amp or conversely, the next phase showing inflated write amp. +* **Point lookup/write mix**: a single writer thread performing random writes while N (in our case, 16) threads perform random point lookups. WAL is enabled and all writes are synced. +* **Range scan/write mix**: similar to the above, with one writer thread and N reader threads (where N was again set to 16 in our tests). The reader threads perform random range scans, with 10 `Next` calls per `Seek`. Again, WAL is enabled, and sync writes are used. +* **Point lookups (read-only)**: N=16 threads perform random point lookups. +* **Range scans (read-only)**: N=16 threads execute random range scans, with 10 `Next`s per `Seek` like above. + +With that out of the way, let’s see how the new BlobDB performs against traditional leveled and universal compaction. In the next few sections, we’ll be looking at write amplification as well as read and write performance. We’ll also briefly compare the write performance of the new BlobDB with the legacy implementation. + +### Write amplification + +Reducing write amp is the original motivation for key-value separation. Here, we follow RocksDB’s definition of write amplification (as used in compaction statistics and the info log). That is, we define write amp as the total amount of data written by flushes and compactions divided by the amount of data written by flushes, where “data written” includes SST files and blob files as well (if applicable). The following charts show that BlobDB significantly reduces write amplification for all of our (non-read only) workloads. + +For the initial load, where due to the nature of the workload both leveled and universal already have a low write amp factor of 1.6, BlobDB has a write amp close to the theoretical minimum of 1.0, namely in the 1.0..1.02 range, depending on value size. How is this possible? Well, the trick is that when key-value separation is used, the full compaction step only has to sort the keys but not the values. This results in a write amp that is about **36% lower** than the already low write amp you get with either leveled or universal. + +In the case of the overwrite workload, BlobDB had a write amp between 1.4 and 1.7 depending on value size. 
This is around **75-78% lower** than the write amp of leveled compaction (6.1 to 6.8) and **70-77% lower** than universal (5.7 to 6.2); for this workload, there wasn’t a huge difference between the performance of leveled and universal. + +When it comes to the point lookup/write mix workload, BlobDB had a write amp between 1.4 and 1.8. This is **83-88% lower** than the write amp of leveled compaction, which had values between 10.8 and 12.5. Universal fared much better than leveled under this workload, and had write amp in the 2.2..6.6 range; however, BlobDB still provided significant gains for all value sizes we tested: namely, write amp was **18-77% lower** than that of universal, depending on value size. + +As for the range scan/write mix workload, BlobDB again had a write amp between 1.4 and 1.8, while leveled had values between 13.6 and 14.9, and universal was between 2.8 and 5.0. In other words, BlobDB’s write amp was **88-90% lower** than that of leveled, and **46-70% lower** than that of universal. + +![Write amplification](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +### Write performance + +In terms of write performance, there are other factors to consider besides write amplification. The following charts show some interesting metrics for the two write-only workloads (initial load and overwrite). As discussed earlier, these two workloads perform a fixed amount of work; the two charts in the top row show how long it took BlobDB, leveled, and universal to complete that work. Note that each bar is broken down into two, corresponding to the two stages of each workload (random write and full compaction for initial load, and random write and waiting for compactions for overwrite). + +For initial load, note that the random write stage takes the same amount of time regardless of which algorithm is used. This is not surprising considering the fact that compactions are disabled during this stage and thus RocksDB is simply writing L0 files (and in BlobDB’s case, blob files) as fast as it can. The second stage, on the other hand, is very different: as mentioned above, BlobDB essentially only needs to read, sort, and rewrite the keys during compaction, which can be done much much faster (with 1 MB values, more than a hundred times faster) than doing the same for large key-values. Due to this, initial load completed **2.3x to 4.7x faster** overall when using BlobDB. + +As for the overwrite workload, BlobDB performs much better during both stages. The two charts in the bottom row help explain why. In the case of both leveled and universal compaction, compactions can’t keep up with the write rate, which eventually leads to back pressure in the form of write stalls. As shown in the chart below, both leveled and universal stall between ~40% and ~70% of the time; on the other hand, BlobDB is stall-free except for the largest value size tested (1 MB). This naturally leads to higher throughput, namely **2.1x to 3.5x higher** throughput compared to leveled, and **1.6x to 3.0x higher** throughput compared to universal. The overwrite time chart also shows that the catch-up stage that waits for all compactions to finish is much shorter (and in fact, at larger value sizes, negligible) with BlobDB. 
+
+![Write performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Read/write and read-only performance
+
+The charts below show the read performance (in terms of operations per second) of BlobDB versus leveled and universal compaction under the two read/write workloads and the two read-only workloads. BlobDB meets or exceeds the read performance of leveled compaction, except for workloads involving range scans at the two smallest value sizes tested (1 KB and 4 KB). It also provides better (in some cases, much better) read performance than universal across the board. In particular, BlobDB provides up to **1.4x higher** read performance than leveled (for larger values), and up to **5.6x higher** than universal.
+
+![Read-write and read-only performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Comparing the two BlobDB implementations
+
+To compare the write performance of the new BlobDB with the legacy implementation, we ran two versions of the first (single-threaded random write) stage of the initial load benchmark using 1 KB values: one with WAL disabled, and one with WAL enabled. The new implementation completed the load **4.6x faster** than the old one without WAL, and **2.3x faster** with WAL.
+
+![Comparing the two BlobDB implementations](/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Future work
+
+There are a few remaining features that are not yet supported by the new BlobDB. The most important one is `Merge` (and the related `GetMergeOperands` API); in addition, we don’t currently support the `EventListener` interface, the `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` APIs, secondary instances, and ingestion of blob files. We will continue to work on closing this gap.
+
+We also have further plans when it comes to performance. These include optimizing garbage collection, introducing a dedicated cache for blobs, improving iterator and `MultiGet` performance, and evolving the blob file format, among others.
+
diff --git a/docs/_posts/2021-05-26-online-validation.markdown b/docs/_posts/2021-05-26-online-validation.markdown
new file mode 100644
index 00000000000..33e9dfc151a
--- /dev/null
+++ b/docs/_posts/2021-05-26-online-validation.markdown
@@ -0,0 +1,17 @@
+---
+title: Online Validation
+layout: post
+author: sdong
+category: blog
+---
+To prevent or mitigate data corruption in RocksDB when software or hardware issues happen, we keep adding online consistency checks and improving existing ones.
+
+We improved ColumnFamilyOptions::force_consistency_checks and enabled it by default. The option performs some basic consistency checks on the LSM-tree, e.g., verifying that files within a level do not overlap. The DB is frozen to new writes if a violation is detected. Previously, the feature’s check was too limited and didn’t always freeze the DB in a timely manner. Last year, we made the checking stricter so that it can [catch much more corrupted LSM-tree structures](https://github.com/facebook/rocksdb/pull/6901). We also fixed several issues where a checking failure was swallowed without freezing the DB. After making force_consistency_checks more reliable, we turned it on by default.
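+
+Most of the checks discussed in this post are controlled through regular options. The following is a minimal sketch using the option names mentioned in this post (paranoid_file_checks and allow_data_in_errors are described below); defaults vary across releases, so treat the explicit assignments as illustrative:
+
+```
+#include "rocksdb/options.h"
+
+rocksdb::Options options;
+// Basic LSM-tree structure checks; now enabled by default.
+options.force_consistency_checks = true;
+// Read back and verify every newly generated SST file (extra overhead).
+options.paranoid_file_checks = true;
+// Verify key ordering while writing out flush/compaction output.
+options.check_flush_compaction_key_order = true;
+// Opt in to more detailed corruption reports that may include key data.
+options.allow_data_in_errors = true;
+```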
+
+ColumnFamilyOptions::paranoid_file_checks does some more expensive extra checking when generating a new SST file. Last year, we extended its coverage: after an SST file is generated, it is read back key by key and two things are checked: (1) the keys are in comparator order (this check is also available, and enabled by default, during file writes via ColumnFamilyOptions::check_flush_compaction_key_order); (2) the hash of all the KVs matches the hash calculated while the KVs were added to the file. These checks detect certain corruptions so that we can prevent corrupt files from being applied to the DB. We suggest users turn it on at least in shadow environments, and consider running it in production too if you can afford the overhead.
+
+A recently added feature checks the number of entries added to the memtable while it is being flushed into an SST file. It provides some online coverage for memtable corruption caused by either software bugs or hardware issues. This feature will be released in the coming release (6.21) and will be on by default. In the future, we will check more counters during memtable flushes, e.g. the number of puts or deletes.
+
+We also improved the reporting of online validation errors to improve debuggability. For example, a failure to parse a corrupt key now reports details about the corrupt key. Since we did not want to expose key data in logs, error messages, etc., by default, this reporting is opt-in via DBOptions::allow_data_in_errors.
+
+More online checking features are planned, and some are more sophisticated, including key/value checksums and sample-based query validation.
diff --git a/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown b/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown
new file mode 100644
index 00000000000..422554a30cc
--- /dev/null
+++ b/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown
@@ -0,0 +1,195 @@
+---
+title: RocksDB Secondary Cache
+layout: post
+author: anand1976
+category: blog
+---
+## Introduction
+
+The RocksDB team is implementing support for a block cache on non-volatile media, such as a local flash device or NVM/SCM. It can be viewed as an extension of RocksDB’s current volatile block cache (LRUCache or ClockCache). The non-volatile block cache acts as a second tier cache that contains blocks evicted from the volatile cache. Those blocks are then promoted to the volatile cache as they become hotter due to access.
+
+This feature is meant for cases where the DB is located on remote storage or cloud storage. The non-volatile cache is officially referred to in RocksDB as the SecondaryCache. By maintaining a SecondaryCache that’s an order of magnitude larger than DRAM, fewer reads would be required from remote storage, thus reducing read latency as well as network bandwidth consumption.
+
+From the user point of view, the local flash cache will support the following requirements -
+
+1. Provide a pointer to a secondary cache when opening a DB
+2. Be able to share the secondary cache across DBs in the same process
+3. Have multiple secondary caches on a host
+4. Support persisting the cache across process restarts and reboots by ensuring repeatability of the cache key
+
+![Architecture](/static/images/rocksdb-secondary-cache/arch_diagram.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Design
+
+When designing the API for a SecondaryCache, we had a choice between making it visible to the RocksDB code (table reader) or hiding it behind the RocksDB block cache.
There are several advantages of hiding it behind the block cache - + +* Allows flexibility in insertion of blocks into the secondary cache. A block can be inserted on eviction from the RAM tier, or it could be eagerly inserted. +* It makes the rest of the RocksDB code less complex by providing a uniform interface regardless of whether a secondary cache is configured or not +* Makes parallel reads, peeking in the cache for prefetching, failure handling etc. easier +* Makes it easier to extend to compressed data if needed, and allows other persistent media, such as PM, to be added as an additional tier + + +We decided to make the secondary cache transparent to the rest of RocksDB code by hiding it behind the block cache. A key issue that we needed to address was the allocation and ownership of memory of the cached items - insertion into the secondary cache may require that memory be allocated by the same. This means that parts of the cached object that can be transferred to the secondary cache needs to be copied out (referred to as **unpacking**), and on a lookup the data stored in the secondary cache needs to be provided to the object constructor (referred to as **packing**). For RocksDB cached objects such as data blocks, index and filter blocks, and compression dictionaries, unpacking involves copying out the raw uncompressed BlockContents of the block, and packing involves constructing the corresponding block/index/filter/dictionary object using the raw uncompressed data. + +Another alternative we considered was the existing PersistentCache interface. However, we decided to not pursue it and eventually deprecate it for the following reasons - +* It is exposed directly to the table reader code, which makes it more difficult to implement different policies such as inclusive/exclusive cache, as well as extending it to more sophisticated admission control policies +* The interface does not allow for custom memory allocation and object packing/unpacking, so new APIs would have to be defined anyway +* The current PersistentCache implementation is very simple and does not have any admission control policies + +## API + +The interface between RocksDB’s block cache and the secondary cache is designed to allow pluggable implementations. For FB internal usage, we plan to use Cachelib with a wrapper to provide the plug-in implementation and use folly and other fbcode libraries, which cannot be used directly by RocksDB, to efficiently implement the cache operations. The following diagrams show the flow of insertion and lookup of a block. + +![Insert flow](/static/images/rocksdb-secondary-cache/insert_flow.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +![Lookup flow](/static/images/rocksdb-secondary-cache/lookup_flow.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +An item in the secondary cache is referenced by a SecondaryCacheHandle. The handle may not be immediately ready or have a valid value. The caller can call IsReady() to determine if its ready, and can call Wait() in order to block until it becomes ready. The caller must call Value() after it becomes ready to determine if the item was successfully read. Value() must return nullptr on failure. + +``` +class SecondaryCacheHandle { + public: + virtual ~SecondaryCacheHandle() {} + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. 
If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; +``` + +The user of the secondary cache (for example, BlockBasedTableReader indirectly through LRUCache) must implement the callbacks defined in CacheItemHelper, in order to facilitate the unpacking/packing of objects for saving to and restoring from the secondary cache. The CreateCallback must be implemented to construct a cacheable object from the raw data in secondary cache. + +``` + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. + struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + // typedef std::function + // CreateCallback; + using CreateCallback = std::function; +``` + +The secondary cache provider must provide a concrete implementation of the SecondaryCache abstract class. + +``` +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +class SecondaryCache { + public: + virtual ~SecondaryCache() {} + + virtual std::string Name() = 0; + + static const std::string Type() { return "SecondaryCache"; } + + // Insert the given value into this cache. The value is not written + // directly. Rather, the SaveToCallback provided by helper_cb will be + // used to extract the persistable data in value, which will be written + // to this tier. The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status is + // success. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. 
The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0; + + // At the discretion of the implementation, erase the data associated + // with key + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready. This would be used + // by MultiGet, for example, to read multitple data blocks in parallel + virtual void WaitAll(std::vector handles) = 0; + + virtual std::string GetPrintableOptions() const = 0; +}; +``` + +A SecondaryCache is configured by the user by providing a pointer to it in LRUCacheOptions - +``` +struct LRUCacheOptions { + ... + // A SecondaryCache instance to use as an additional cache tier + std::shared_ptr secondary_cache; + ... +}; +``` + +## Current Status + +The initial RocksDB support for the secondary cache has been merged into the master branch, and will be available in the 6.21 release. This includes providing a way for the user to configure a secondary cache when instantiating RocksDB’s LRU cache (volatile block cache), spilling blocks evicted from the LRU cache to the flash cache, promoting a block read from the SecondaryCache to the LRU cache, update tools such as cache_bench and db_bench to specify a flash cache. The relevant PRs are [#8271](https://github.com/facebook/rocksdb/pull/8271), [#8191](https://github.com/facebook/rocksdb/pull/8191), and [#8312](https://github.com/facebook/rocksdb/pull/8312). + +We prototyped an end-to-end solution, with the above PRs as well as a Cachelib based implementation of the SecondaryCache. We ran a mixgraph benchmark to simulate a realistic read/write workload. The results showed a 15% gain with the local flash cache over no local cache, and a ~25-30% reduction in network reads with a corresponding decrease in cache misses. + +![Throughput](/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +![Hit Rate](/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +## Future Work + +In the short term, we plan to do the following in order to fully integrate the SecondaryCache with RocksDB - + +1. Use DB session ID as the cache key prefix to ensure uniqueness and repeatability +2. Optimize flash cache usage of MultiGet and iterator workloads +3. Stress testing +4. More benchmarking + +Longer term, we plan to deploy this in production at Facebook. + +## Call to Action + +We are hoping for a community contribution of a secondary cache implementation, which would make this feature usable by the broader RocksDB userbase. If you are interested in contributing, please reach out to us in [this issue](https://github.com/facebook/rocksdb/issues/8347). + diff --git a/docs/_posts/2021-05-31-dictionary-compression.markdown b/docs/_posts/2021-05-31-dictionary-compression.markdown new file mode 100644 index 00000000000..9b0f4529344 --- /dev/null +++ b/docs/_posts/2021-05-31-dictionary-compression.markdown @@ -0,0 +1,157 @@ +--- +title: Preset Dictionary Compression +layout: post +author: ajkr +category: blog +--- + +## Summary + +Compression algorithms relying on an adaptive dictionary, such as LZ4, zstd, and zlib, struggle to achieve good compression ratios on small inputs when using the basic compress API. 
+With the basic compress API, the compressor starts with an empty dictionary. +With small inputs, not much content gets added to the dictionary during the compression. +Combined, these factors suggest the dictionary will never have enough contents to achieve great compression ratios. + +RocksDB groups key-value pairs into data blocks before storing them in files. +For use cases that are heavy on random accesses, smaller data block size is sometimes desirable for reducing I/O and CPU spent reading blocks. +However, as explained above, smaller data block size comes with the downside of worse compression ratio when using the basic compress API. + +Fortunately, zstd and other libraries offer advanced compress APIs that preset the dictionary. +A preset dictionary makes it possible for the compressor to start from a useful state instead of from an empty one, making compression immediately effective. + +RocksDB now optionally takes advantage of these dictionary presetting APIs. +The challenges in integrating this feature into the storage engine were more substantial than apparent on the surface. +First, we need to target a preset dictionary to the relevant data. +Second, preset dictionaries need to be trained from data samples, which need to be gathered. +Third, preset dictionaries need to be persisted since they are needed at decompression time. +Fourth, overhead in accessing the preset dictionary must be minimized to prevent regression in critical code paths. +Fifth, we need easy-to-use measurement to evaluate candidate use cases and production impact. + +In production, we have deployed dictionary presetting to save space in multiple RocksDB use cases with data block size 8KB or smaller. +We have measured meaningful benefit to compression ratio in use cases with data block size up to 16KB. +We have also measured a use case that can save both CPU and space by reducing data block size and turning on dictionary presetting at the same time. + +## Feature design +#### Targeting + +Over time we have considered a few possibilities for the scope of a dictionary. + +- Subcompaction +- SST file +- Column family + +The original choice was subcompaction scope. +This enabled an approach with minimal buffering overhead because we could collect samples while generating the first output SST file. +The dictionary could then be trained and applied to subsequent SST files in the same subcompaction. + +However, we found a large use case where the proximity of data in the keyspace was more correlated with its similarity than we had predicted. +In particular, the approach of training a dictionary on an adjacent file yielded substantially worse ratios than training the dictionary on the same file it would be used to compress. +In response to this finding, we changed the preset dictionary scope to per SST file. + +With this change in approach, we had to face the problem we had hoped to avoid: how can we compress all of an SST file's data blocks with the same preset dictionary while that dictionary can only be trained after many data blocks have been sampled? +The solutions we considered both involved a new overhead. +We could read the input more than once and introduce I/O overhead, or we could buffer the uncompressed output file data blocks until a dictionary is trained, introducing memory overhead. +We chose to take the hit on memory overhead. + +Another approach that we considered was associating multiple dictionaries with a column family. 
+For example, in MyRocks there could be a dictionary trained on data from each large table.
+When compressing a data block, we would look at the table to which its data belongs and pick the corresponding dictionary.
+However, this approach would introduce many challenges.
+RocksDB would need to be aware of the key schema to know where the table boundaries are.
+RocksDB would also need to periodically update the dictionaries to account for changes in data patterns.
+It would need somewhere to store dictionaries at column family scope.
+Overall, we decided these challenges made the approach too difficult to pursue.
+
+#### Training
+
+![](/static/images/dictcmp/dictcmp_raw_sampled.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+Raw samples mode (`zstd_max_train_bytes == 0`) +

+ +As mentioned earlier, the approach we took is to build the dictionary from buffered uncompressed data blocks. +The first row of data blocks in these diagrams illustrate this buffering. +The second row illustrates training samples selected from the buffered blocks. +In raw samples mode (above), the final dictionary is simply the concatenation of these samples. +Whereas, in zstd training mode (below), these samples will be passed to the trainer to produce the final dictionary. + +![](/static/images/dictcmp/dictcmp_zstd_trained.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+zstd training mode (`zstd_max_train_bytes > 0`) +

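+
+For readers curious what “training” means concretely, the snippet below is a rough illustration of zstd training mode using zstd’s public `ZDICT_trainFromBuffer()` API. This is not RocksDB’s internal code; RocksDB drives the trainer for you, and the buffer layout here is just an assumption for the example.
+
+```
+#include <string>
+#include <vector>
+#include <zdict.h>  // zstd dictionary trainer
+
+// `samples` holds the sampled (uncompressed) data block pieces, back to back.
+// `sample_sizes` holds the size of each piece, in order.
+std::string TrainDictionary(const std::string& samples,
+                            const std::vector<size_t>& sample_sizes,
+                            size_t max_dict_bytes) {
+  std::string dict(max_dict_bytes, '\0');
+  size_t dict_size = ZDICT_trainFromBuffer(
+      &dict[0], dict.size(), samples.data(), sample_sizes.data(),
+      static_cast<unsigned>(sample_sizes.size()));
+  if (ZDICT_isError(dict_size)) {
+    return "";  // fall back to no preset dictionary
+  }
+  dict.resize(dict_size);
+  return dict;
+}
+```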
+ +#### Compression path + +Once the preset dictionary is generated by the above process, we apply it to the buffered data blocks and write them to the output file. +Thereafter, newly generated data blocks are immediately compressed and written out. + +One optimization here is available to zstd v0.7.0+ users. +Instead of deserializing the dictionary on each compress invocation, we can do that work once and reuse it. +A `ZSTD_CDict` holds this digested dictionary state and is passed to the compress API. + +#### Persistence + +When an SST file's data blocks are compressed using a preset dictionary, that dictionary is stored inside the file for later use in decompression. + +![](/static/images/dictcmp/dictcmp_sst_blocks.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+SST file layout with the preset dictionary in its own (uncompressed) block +

+ +#### Decompression path + +To decompress, we need to provide both the data block and the dictionary used to compress it. +Since dictionaries are just blocks in a file, we access them through block cache. +However this additional load on block cache can be problematic. +It can be alleviated by pinning the dictionaries to avoid going through the LRU locks. + +An optimization analogous to the digested dictionary exists for certain zstd users (see User API section for details). +When enabled, the block cache stores the digested dictionary state for decompression (`ZSTD_DDict`) instead of the block contents. +In some cases we have seen decompression CPU decrease overall when enabling dictionary thanks to this optimization. + +#### Measurement + +Typically our first step in evaluating a candidate use case is an offline analysis of the data. +This gives us a quick idea whether presetting dictionary will be beneficial without any code, config, or data changes. +Our `sst_dump` tool reports what size SST files would have been using specified compression libraries and options. +We can select random SST files and compare the size with vs. without dictionary. + +When that goes well, the next step is to see how it works in a live DB, like a production shadow or canary. +There we can observe how it affects application/system metrics. + +Even after dictionary is enabled, there is the question of how much space was finally saved. +We provide a way to A/B test size with vs. without dictionary while running in production. +This feature picks a sample of data blocks to compress in multiple ways -- one of the outputs is stored, while the other outputs are thrown away after counting their size. +Due to API limitations, the stored output always has to be the dictionary-compressed one, so this feature can only be used after enabling dictionary. +The size with and without dictionary are stored in the SST file as table properties. +These properties can be aggregated across all SST files in a DB (and across all DBs in a tier) to learn the final space saving. + +## User API + +RocksDB allows presetting compression dictionary for users of LZ4, zstd, and zlib. +The most advanced capabilities are available to zstd v1.1.4+ users who statically link (see below). +Newer versions of zstd (v1.3.6+) have internal changes to the dictionary trainer and digested dictionary management, which significantly improve memory and CPU efficiency. + +Run-time settings: + +- `CompressionOptions::max_dict_bytes`: Limit on per-SST file dictionary size. Increasing this causes dictionaries to consume more space and memory for the possibility of better data block compression. A typical value we use is 16KB. +- (**zstd only**) `CompressionOptions::zstd_max_train_bytes`: Limit on training data passed to zstd dictionary trainer. Larger values cause the training to consume more CPU (and take longer) while generating more effective dictionaries. The starting point guidance we received from zstd team is to set it to 100x `CompressionOptions::max_dict_bytes`. +- `CompressionOptions::max_dict_buffer_bytes`: Limit on data buffering from which training samples are gathered. By default we buffer up to the target file size per ongoing background job. If this amount of memory is concerning, this option can constrain the buffering with the downside that training samples will cover a smaller portion of the SST file. Work is ongoing to charge this memory usage to block cache so it will not need to be accounted for separately. 
+- `BlockBasedTableOptions::cache_index_and_filter_blocks`: Controls whether metadata blocks including dictionary are accessed through block cache or held in table reader memory (yes, its name is outdated). +- `BlockBasedTableOptions::metadata_cache_options`: Controls what metadata blocks are pinned in block cache. Pinning avoids LRU contention at the risk of cold blocks holding memory. +- `ColumnFamilyOptions::sample_for_compression`: Controls frequency of measuring extra compressions on data blocks using various libraries with default settings (i.e., without preset dictionary). + +Compile-time setting: + +- (**zstd only**) `EXTRA_CXXFLAGS=-DZSTD_STATIC_LINKING_ONLY`: Hold digested dictionaries in block cache to save repetitive deserialization overhead. This saves a lot of CPU for read-heavy workloads. This compiler flag is necessary because one of the digested dictionary APIs we use is marked as experimental. We still use it in production, however. + +Function: + +- `DB::GetPropertiesOfAllTables()`: The properties `kSlowCompressionEstimatedDataSize` and `kFastCompressionEstimatedDataSize` estimate what the data block size (`kDataSize`) would have been if the corresponding compression library had been used. These properties are only present when `ColumnFamilyOptions::sample_for_compression` causes one or more samples to be measured, and they become more accurate with higher sampling frequency. + +Tool: + +- `sst_dump --command=recompress`: Offline analysis tool that reports what the SST file size would have been using the specified compression library and options. diff --git a/docs/static/images/dictcmp/dictcmp_raw_sampled.png b/docs/static/images/dictcmp/dictcmp_raw_sampled.png new file mode 100644 index 00000000000..2eb6463c248 Binary files /dev/null and b/docs/static/images/dictcmp/dictcmp_raw_sampled.png differ diff --git a/docs/static/images/dictcmp/dictcmp_sst_blocks.png b/docs/static/images/dictcmp/dictcmp_sst_blocks.png new file mode 100644 index 00000000000..551860b2e9b Binary files /dev/null and b/docs/static/images/dictcmp/dictcmp_sst_blocks.png differ diff --git a/docs/static/images/dictcmp/dictcmp_zstd_trained.png b/docs/static/images/dictcmp/dictcmp_zstd_trained.png new file mode 100644 index 00000000000..966c7fe0f6c Binary files /dev/null and b/docs/static/images/dictcmp/dictcmp_zstd_trained.png differ diff --git a/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png new file mode 100644 index 00000000000..7215390cb5f Binary files /dev/null and b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png differ diff --git a/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png new file mode 100644 index 00000000000..f412ee60f09 Binary files /dev/null and b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png differ diff --git a/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png new file mode 100644 index 00000000000..19f40b035ae Binary files /dev/null and b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png differ diff --git a/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png new file mode 100644 index 00000000000..a1d43da0c24 Binary 
files /dev/null and b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png b/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png new file mode 100644 index 00000000000..10fa7372825 Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png b/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png new file mode 100644 index 00000000000..df2e333f9cd Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/arch_diagram.png b/docs/static/images/rocksdb-secondary-cache/arch_diagram.png new file mode 100644 index 00000000000..696a376ed8a Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/arch_diagram.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/insert_flow.png b/docs/static/images/rocksdb-secondary-cache/insert_flow.png new file mode 100644 index 00000000000..f02e7e4c509 Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/insert_flow.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/lookup_flow.png b/docs/static/images/rocksdb-secondary-cache/lookup_flow.png new file mode 100644 index 00000000000..2b3c70edb49 Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/lookup_flow.png differ diff --git a/env/composite_env.cc b/env/composite_env.cc new file mode 100644 index 00000000000..0d70855af51 --- /dev/null +++ b/env/composite_env.cc @@ -0,0 +1,383 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "env/composite_env_wrapper.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// The CompositeEnvWrapper class provides an interface that is compatible +// with the old monolithic Env API, and an implementation that wraps around +// the new Env that provides threading and other OS related functionality, and +// the new FileSystem API that provides storage functionality. By +// providing the old Env interface, it allows the rest of RocksDB code to +// be agnostic of whether the underlying Env implementation is a monolithic +// Env or an Env + FileSystem. In the former case, the user will specify +// Options::env only, whereas in the latter case, the user will specify +// Options::env and Options::file_system. 
+ +class CompositeSequentialFileWrapper : public SequentialFile { + public: + explicit CompositeSequentialFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(n, io_opts, result, scratch, &dbg); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeRandomAccessFileWrapper : public RandomAccessFile { + public: + explicit CompositeRandomAccessFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + IOOptions io_opts; + IODebugContext dbg; + std::vector fs_reqs; + Status status; + + fs_reqs.resize(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].offset = reqs[i].offset; + fs_reqs[i].len = reqs[i].len; + fs_reqs[i].scratch = reqs[i].scratch; + fs_reqs[i].status = IOStatus::OK(); + } + status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); + for (size_t i = 0; i < num_reqs; ++i) { + reqs[i].result = fs_reqs[i].result; + reqs[i].status = fs_reqs[i].status; + } + return status; + } + Status Prefetch(uint64_t offset, size_t n) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Prefetch(offset, n, io_opts, &dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((FSRandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + std::unique_ptr target_; +}; + +class CompositeWritableFileWrapper : public WritableFile { + public: + explicit CompositeWritableFileWrapper(std::unique_ptr& t) + : target_(std::move(t)) {} + + Status Append(const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, &dbg); + } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, verification_info, &dbg); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, &dbg); + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; 
+ return target_->PositionedAppend(data, offset, io_opts, verification_info, + &dbg); + } + Status Truncate(uint64_t size) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Truncate(size, io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->GetFileSize(io_opts, &dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->RangeSync(offset, nbytes, io_opts, &dbg); + } + + void PrepareWrite(size_t offset, size_t len) override { + IOOptions io_opts; + IODebugContext dbg; + target_->PrepareWrite(offset, len, io_opts, &dbg); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Allocate(offset, len, io_opts, &dbg); + } + + std::unique_ptr* target() { return &target_; } + + private: + std::unique_ptr target_; +}; + +class CompositeRandomRWFileWrapper : public RandomRWFile { + public: + explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Write(offset, data, io_opts, &dbg); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + 
return target_->Close(io_opts, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeDirectoryWrapper : public Directory { + public: + explicit CompositeDirectoryWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; +} // namespace + +Status CompositeEnv::NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeSequentialFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeRandomAccessFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewWritableFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReopenWritableFile(fname, FileOptions(options), &file, + &dbg); + if (status.ok()) { + result->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReuseWritableFile(fname, old_fname, + FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg); + if (status.ok()) { + result->reset(new CompositeRandomRWFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + IOOptions io_opts; + IODebugContext dbg; + std::unique_ptr dir; + Status status; + status = file_system_->NewDirectory(name, io_opts, &dir, &dbg); + if (status.ok()) { + result->reset(new CompositeDirectoryWrapper(dir)); + } + return status; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/env/composite_env_wrapper.h b/env/composite_env_wrapper.h index 7a0da5c3e26..c4df652f921 100644 --- a/env/composite_env_wrapper.h +++ b/env/composite_env_wrapper.h @@ -7,280 +7,24 @@ #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" -namespace ROCKSDB_NAMESPACE { - -// The CompositeEnvWrapper class provides an interface that is 
compatible -// with the old monolithic Env API, and an implementation that wraps around -// the new Env that provides threading and other OS related functionality, and -// the new FileSystem API that provides storage functionality. By -// providing the old Env interface, it allows the rest of RocksDB code to -// be agnostic of whether the underlying Env implementation is a monolithic -// Env or an Env + FileSystem. In the former case, the user will specify -// Options::env only, whereas in the latter case, the user will specify -// Options::env and Options::file_system. - -class CompositeSequentialFileWrapper : public SequentialFile { - public: - explicit CompositeSequentialFileWrapper( - std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Read(size_t n, Slice* result, char* scratch) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(n, io_opts, result, scratch, &dbg); - } - Status Skip(uint64_t n) override { return target_->Skip(n); } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); - } - - private: - std::unique_ptr target_; -}; - -class CompositeRandomAccessFileWrapper : public RandomAccessFile { - public: - explicit CompositeRandomAccessFileWrapper( - std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(offset, n, io_opts, result, scratch, &dbg); - } - Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { - IOOptions io_opts; - IODebugContext dbg; - std::vector fs_reqs; - Status status; - - fs_reqs.resize(num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - fs_reqs[i].offset = reqs[i].offset; - fs_reqs[i].len = reqs[i].len; - fs_reqs[i].scratch = reqs[i].scratch; - fs_reqs[i].status = IOStatus::OK(); - } - status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); - for (size_t i = 0; i < num_reqs; ++i) { - reqs[i].result = fs_reqs[i].result; - reqs[i].status = fs_reqs[i].status; - } - return status; - } - Status Prefetch(uint64_t offset, size_t n) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Prefetch(offset, n, io_opts, &dbg); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - }; - void Hint(AccessPattern pattern) override { - target_->Hint((FSRandomAccessFile::AccessPattern)pattern); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - - private: - std::unique_ptr target_; -}; - -class CompositeWritableFileWrapper : public WritableFile { - public: - explicit CompositeWritableFileWrapper(std::unique_ptr& t) - : target_(std::move(t)) {} - - Status Append(const Slice& data) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Append(data, 
io_opts, &dbg); - } - Status PositionedAppend(const Slice& data, uint64_t offset) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->PositionedAppend(data, offset, io_opts, &dbg); - } - Status Truncate(uint64_t size) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Truncate(size, io_opts, &dbg); - } - Status Close() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Close(io_opts, &dbg); - } - Status Flush() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Flush(io_opts, &dbg); - } - Status Sync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Sync(io_opts, &dbg); - } - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } - - bool use_direct_io() const override { return target_->use_direct_io(); } - - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - - void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { - target_->SetWriteLifeTimeHint(hint); - } - - Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { - return target_->GetWriteLifeTimeHint(); - } - - uint64_t GetFileSize() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->GetFileSize(io_opts, &dbg); - } - - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); - } - - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { - target_->GetPreallocationStatus(block_size, last_allocated_block); - } - - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } - - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->RangeSync(offset, nbytes, io_opts, &dbg); - } - - void PrepareWrite(size_t offset, size_t len) override { - IOOptions io_opts; - IODebugContext dbg; - target_->PrepareWrite(offset, len, io_opts, &dbg); - } - - Status Allocate(uint64_t offset, uint64_t len) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Allocate(offset, len, io_opts, &dbg); - } - - std::unique_ptr* target() { return &target_; } - - private: - std::unique_ptr target_; -}; - -class CompositeRandomRWFileWrapper : public RandomRWFile { - public: - explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) - : target_(std::move(target)) {} - - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status Write(uint64_t offset, const Slice& data) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Write(offset, data, io_opts, &dbg); - } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(offset, n, io_opts, result, scratch, &dbg); - } - Status Flush() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Flush(io_opts, &dbg); - } - Status Sync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Sync(io_opts, &dbg); - } - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - 
return target_->Fsync(io_opts, &dbg); - } - Status Close() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Close(io_opts, &dbg); - } - - private: - std::unique_ptr target_; -}; - -class CompositeDirectoryWrapper : public Directory { - public: - explicit CompositeDirectoryWrapper(std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#undef LoadLibrary +#endif - private: - std::unique_ptr target_; -}; +namespace ROCKSDB_NAMESPACE { -class CompositeEnvWrapper : public Env { +class CompositeEnv : public Env { public: // Initialize a CompositeEnvWrapper that delegates all thread/time related // calls to env, and all file operations to fs - explicit CompositeEnvWrapper(Env* env, std::shared_ptr fs) - : Env(fs), env_target_(env) {} - ~CompositeEnvWrapper() {} - - // Return the target to which this Env forwards all calls - Env* env_target() const { return env_target_; } + explicit CompositeEnv(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : Env(fs, clock) {} Status RegisterDbPaths(const std::vector& paths) override { return file_system_->RegisterDbPaths(paths); @@ -292,99 +36,37 @@ class CompositeEnvWrapper : public Env { // The following text is boilerplate that forwards all methods to target() Status NewSequentialFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeSequentialFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewRandomAccessFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeRandomAccessFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewWritableFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status ReopenWritableFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) override { - IODebugContext dbg; - Status status; - std::unique_ptr file; - status = file_system_->ReopenWritableFile(fname, FileOptions(options), - &file, &dbg); - if (status.ok()) { - result->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - Status status; - std::unique_ptr file; - status = file_system_->ReuseWritableFile(fname, old_fname, - FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeWritableFileWrapper(file)); - } - 
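// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: how the CompositeEnvWrapper
// constructors declared in this (internal) header are typically combined.
// Only Env, FileSystem, SystemClock and CompositeEnvWrapper as declared above
// are assumed; `custom_fs` and `custom_clock` stand in for user-supplied
// implementations.
#include <memory>

#include "env/composite_env_wrapper.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/system_clock.h"

namespace {
// Builds an Env that keeps Env::Default() for thread scheduling, but routes
// all file I/O through `custom_fs` and all time queries through
// `custom_clock`. The returned Env would then be set on Options::env.
std::unique_ptr<ROCKSDB_NAMESPACE::Env> MakeCompositeEnv(
    const std::shared_ptr<ROCKSDB_NAMESPACE::FileSystem>& custom_fs,
    const std::shared_ptr<ROCKSDB_NAMESPACE::SystemClock>& custom_clock) {
  return std::unique_ptr<ROCKSDB_NAMESPACE::Env>(
      new ROCKSDB_NAMESPACE::CompositeEnvWrapper(
          ROCKSDB_NAMESPACE::Env::Default(), custom_fs, custom_clock));
}
}  // namespace
// ---------------------------------------------------------------------------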
return status; - } + const EnvOptions& options) override; + Status NewRandomRWFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg); - if (status.ok()) { - result->reset(new CompositeRandomRWFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override { return file_system_->NewMemoryMappedFileBuffer(fname, result); } + Status NewDirectory(const std::string& name, - std::unique_ptr* result) override { - IOOptions io_opts; - IODebugContext dbg; - std::unique_ptr dir; - Status status; - status = file_system_->NewDirectory(name, io_opts, &dir, &dbg); - if (status.ok()) { - result->reset(new CompositeDirectoryWrapper(dir)); - } - return status; - } + std::unique_ptr* result) override; + Status FileExists(const std::string& f) override { IOOptions io_opts; IODebugContext dbg; @@ -498,109 +180,32 @@ class CompositeEnvWrapper : public Env { return file_system_->IsDirectory(path, io_opts, is_dir, &dbg); } -#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) - Status LoadLibrary(const std::string& lib_name, - const std::string& search_path, - std::shared_ptr* result) override { - return env_target_->LoadLibrary(lib_name, search_path, result); - } -#endif - - void Schedule(void (*f)(void* arg), void* a, Priority pri, - void* tag = nullptr, void (*u)(void* arg) = nullptr) override { - return env_target_->Schedule(f, a, pri, tag, u); - } - - int UnSchedule(void* tag, Priority pri) override { - return env_target_->UnSchedule(tag, pri); - } - - void StartThread(void (*f)(void*), void* a) override { - return env_target_->StartThread(f, a); - } - void WaitForJoin() override { return env_target_->WaitForJoin(); } - unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { - return env_target_->GetThreadPoolQueueLen(pri); - } Status GetTestDirectory(std::string* path) override { IOOptions io_opts; IODebugContext dbg; return file_system_->GetTestDirectory(io_opts, path, &dbg); } - uint64_t NowMicros() override { return env_target_->NowMicros(); } - uint64_t NowNanos() override { return env_target_->NowNanos(); } - uint64_t NowCPUNanos() override { return env_target_->NowCPUNanos(); } - - void SleepForMicroseconds(int micros) override { - env_target_->SleepForMicroseconds(micros); - } - Status GetHostName(char* name, uint64_t len) override { - return env_target_->GetHostName(name, len); - } - Status GetCurrentTime(int64_t* unix_time) override { - return env_target_->GetCurrentTime(unix_time); - } - void SetBackgroundThreads(int num, Priority pri) override { - return env_target_->SetBackgroundThreads(num, pri); - } - int GetBackgroundThreads(Priority pri) override { - return env_target_->GetBackgroundThreads(pri); - } - - Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { - return env_target_->SetAllowNonOwnerAccess(allow_non_owner_access); - } - - void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { - return env_target_->IncBackgroundThreadsIfNeeded(num, pri); - } - - void LowerThreadPoolIOPriority(Priority pool) override { - env_target_->LowerThreadPoolIOPriority(pool); - } - - void LowerThreadPoolCPUPriority(Priority pool) override { - env_target_->LowerThreadPoolCPUPriority(pool); - } - - Status LowerThreadPoolCPUPriority(Priority pool, 
CpuPriority pri) override { - return env_target_->LowerThreadPoolCPUPriority(pool, pri); - } - - std::string TimeToString(uint64_t time) override { - return env_target_->TimeToString(time); - } - - Status GetThreadList(std::vector* thread_list) override { - return env_target_->GetThreadList(thread_list); - } - - ThreadStatusUpdater* GetThreadStatusUpdater() const override { - return env_target_->GetThreadStatusUpdater(); - } - - uint64_t GetThreadID() const override { return env_target_->GetThreadID(); } - - std::string GenerateUniqueId() override { - return env_target_->GenerateUniqueId(); - } EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { return file_system_->OptimizeForLogRead(FileOptions(env_options)); } + EnvOptions OptimizeForManifestRead( const EnvOptions& env_options) const override { return file_system_->OptimizeForManifestRead(FileOptions(env_options)); } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const override { return file_system_->OptimizeForLogWrite(FileOptions(env_options), db_options); } + EnvOptions OptimizeForManifestWrite( const EnvOptions& env_options) const override { return file_system_->OptimizeForManifestWrite(FileOptions(env_options)); } + EnvOptions OptimizeForCompactionTableWrite( const EnvOptions& env_options, const ImmutableDBOptions& immutable_ops) const override { @@ -613,7 +218,12 @@ class CompositeEnvWrapper : public Env { return file_system_->OptimizeForCompactionTableRead( FileOptions(env_options), db_options); } - + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForBlobFileRead(FileOptions(env_options), + db_options); + } // This seems to clash with a macro on Windows, so #undef it here #ifdef GetFreeSpace #undef GetFreeSpace @@ -623,522 +233,113 @@ class CompositeEnvWrapper : public Env { IODebugContext dbg; return file_system_->GetFreeSpace(path, io_opts, diskfree, &dbg); } + uint64_t NowMicros() override { return system_clock_->NowMicros(); } + uint64_t NowNanos() override { return system_clock_->NowNanos(); } - private: - Env* env_target_; -}; - -class LegacySequentialFileWrapper : public FSSequentialFile { - public: - explicit LegacySequentialFileWrapper( - std::unique_ptr&& _target) - : target_(std::move(_target)) {} - - IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, - char* scratch, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Read(n, result, scratch)); - } - IOStatus Skip(uint64_t n) override { - return status_to_io_status(target_->Skip(n)); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); - } - IOStatus PositionedRead(uint64_t offset, size_t n, - const IOOptions& /*options*/, Slice* result, - char* scratch, IODebugContext* /*dbg*/) override { - return status_to_io_status( - target_->PositionedRead(offset, n, result, scratch)); - } - SequentialFile* target() { return target_.get(); } - - private: - std::unique_ptr target_; -}; + uint64_t NowCPUNanos() override { return system_clock_->CPUNanos(); } -class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { - public: - explicit LegacyRandomAccessFileWrapper( - std::unique_ptr&& 
target) - : target_(std::move(target)) {} - - IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, - Slice* result, char* scratch, - IODebugContext* /*dbg*/) const override { - return status_to_io_status(target_->Read(offset, n, result, scratch)); + void SleepForMicroseconds(int micros) override { + system_clock_->SleepForMicroseconds(micros); } - IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - std::vector reqs; - Status status; - - reqs.reserve(num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - ReadRequest req; - req.offset = fs_reqs[i].offset; - req.len = fs_reqs[i].len; - req.scratch = fs_reqs[i].scratch; - req.status = Status::OK(); - - reqs.emplace_back(req); - } - status = target_->MultiRead(reqs.data(), num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - fs_reqs[i].result = reqs[i].result; - fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); - } - return status_to_io_status(std::move(status)); - ; - } - IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Prefetch(offset, n)); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - }; - void Hint(AccessPattern pattern) override { - target_->Hint((RandomAccessFile::AccessPattern)pattern); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); + Status GetCurrentTime(int64_t* unix_time) override { + return system_clock_->GetCurrentTime(unix_time); } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); + std::string TimeToString(uint64_t time) override { + return system_clock_->TimeToString(time); } - - private: - std::unique_ptr target_; }; -class LegacyWritableFileWrapper : public FSWritableFile { +class CompositeEnvWrapper : public CompositeEnv { public: - explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) - : target_(std::move(_target)) {} - - IOStatus Append(const Slice& data, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Append(data)); - } - IOStatus Append(const Slice& data, const IOOptions& /*options*/, - const DataVerificationInfo& /*verification_info*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Append(data)); - } - IOStatus PositionedAppend(const Slice& data, uint64_t offset, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->PositionedAppend(data, offset)); - } - IOStatus PositionedAppend(const Slice& data, uint64_t offset, - const IOOptions& /*options*/, - const DataVerificationInfo& /*verification_info*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->PositionedAppend(data, offset)); - } - IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Truncate(size)); - } - IOStatus Close(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Close()); - } - IOStatus Flush(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Flush()); - } - IOStatus Sync(const IOOptions& 
/*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Sync()); - } - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } - - bool use_direct_io() const override { return target_->use_direct_io(); } - - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - - void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { - target_->SetWriteLifeTimeHint(hint); - } - - Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { - return target_->GetWriteLifeTimeHint(); - } - - uint64_t GetFileSize(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return target_->GetFileSize(); - } - - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); - } - - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { - target_->GetPreallocationStatus(block_size, last_allocated_block); - } - - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } + // Initialize a CompositeEnvWrapper that delegates all thread/time related + // calls to env, and all file operations to fs + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); - } + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} - IOStatus RangeSync(uint64_t offset, uint64_t nbytes, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->RangeSync(offset, nbytes)); - } + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& sc) + : CompositeEnv(fs, sc), env_target_(env) {} - void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - target_->PrepareWrite(offset, len); - } + // Return the target to which this Env forwards all calls + Env* env_target() const { return env_target_; } - IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Allocate(offset, len)); +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return env_target_->LoadLibrary(lib_name, search_path, result); } +#endif - WritableFile* target() { return target_.get(); } - - private: - std::unique_ptr target_; -}; - -class LegacyRandomRWFileWrapper : public FSRandomRWFile { - public: - explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) - : target_(std::move(target)) {} - - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - IOStatus Write(uint64_t offset, const Slice& data, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Write(offset, data)); - } - IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, - Slice* result, char* 
scratch, - IODebugContext* /*dbg*/) const override { - return status_to_io_status(target_->Read(offset, n, result, scratch)); - } - IOStatus Flush(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Flush()); - } - IOStatus Sync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Sync()); - } - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - IOStatus Close(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Close()); + void Schedule(void (*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return env_target_->Schedule(f, a, pri, tag, u); } - private: - std::unique_ptr target_; -}; - -class LegacyDirectoryWrapper : public FSDirectory { - public: - explicit LegacyDirectoryWrapper(std::unique_ptr&& target) - : target_(std::move(target)) {} - - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); + int UnSchedule(void* tag, Priority pri) override { + return env_target_->UnSchedule(tag, pri); } - private: - std::unique_ptr target_; -}; - -class LegacyFileSystemWrapper : public FileSystem { - public: - // Initialize an EnvWrapper that delegates all calls to *t - explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} - ~LegacyFileSystemWrapper() override {} - - const char* Name() const override { return "Legacy File System"; } - - // Return the target to which this Env forwards all calls - Env* target() const { return target_; } - - // The following text is boilerplate that forwards all methods to target() - IOStatus NewSequentialFile(const std::string& f, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewSequentialFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacySequentialFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewRandomAccessFile(const std::string& f, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewRandomAccessFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewWritableFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus ReopenWritableFile(const std::string& fname, - const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->ReopenWritableFile(fname, &file, file_opts); - if (s.ok()) { - result->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { 
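// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: with LegacyFileSystemWrapper
// moving out of this header and into env.cc, code that needs FileSystem
// semantics for a plain Env* is expected to go through Env::GetFileSystem(),
// which for a legacy Env returns the adapter installed by the Env base-class
// constructor. Only the FileSystem API already used throughout this file is
// assumed; the helper name is hypothetical.
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/file_system.h"

// Creates (and immediately closes) an empty file through the Env's
// FileSystem, passing default FileOptions/IOOptions the same way the
// Composite*Wrapper classes in this file do.
ROCKSDB_NAMESPACE::IOStatus TouchFile(ROCKSDB_NAMESPACE::Env* env,
                                      const std::string& fname) {
  const std::shared_ptr<ROCKSDB_NAMESPACE::FileSystem>& fs =
      env->GetFileSystem();
  std::unique_ptr<ROCKSDB_NAMESPACE::FSWritableFile> file;
  ROCKSDB_NAMESPACE::IOStatus s = fs->NewWritableFile(
      fname, ROCKSDB_NAMESPACE::FileOptions(), &file, /*dbg=*/nullptr);
  if (s.ok()) {
    s = file->Close(ROCKSDB_NAMESPACE::IOOptions(), /*dbg=*/nullptr);
  }
  return s;
}
// ---------------------------------------------------------------------------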
- std::unique_ptr file; - Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewRandomRWFile(const std::string& fname, - const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewRandomRWFile(fname, &file, file_opts); - if (s.ok()) { - result->reset(new LegacyRandomRWFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result) override { - return status_to_io_status( - target_->NewMemoryMappedFileBuffer(fname, result)); - } - IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr dir; - Status s = target_->NewDirectory(name, &dir); - if (s.ok()) { - result->reset(new LegacyDirectoryWrapper(std::move(dir))); - } - return status_to_io_status(std::move(s)); - } - IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->FileExists(f)); - } - IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, - std::vector* r, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetChildren(dir, r)); - } - IOStatus GetChildrenFileAttributes(const std::string& dir, - const IOOptions& /*options*/, - std::vector* result, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); - } - IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->DeleteFile(f)); - } - IOStatus Truncate(const std::string& fname, size_t size, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Truncate(fname, size)); - } - IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->CreateDir(d)); - } - IOStatus CreateDirIfMissing(const std::string& d, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->CreateDirIfMissing(d)); - } - IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->DeleteDir(d)); + void StartThread(void (*f)(void*), void* a) override { + return env_target_->StartThread(f, a); } - IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, - uint64_t* s, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetFileSize(f, s)); + void WaitForJoin() override { return env_target_->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return env_target_->GetThreadPoolQueueLen(pri); } - IOStatus GetFileModificationTime(const std::string& fname, - const IOOptions& /*options*/, - uint64_t* file_mtime, - IODebugContext* /*dbg*/) override { - return status_to_io_status( - target_->GetFileModificationTime(fname, file_mtime)); + Status GetHostName(char* name, uint64_t len) override { + return env_target_->GetHostName(name, len); } - - IOStatus GetAbsolutePath(const std::string& db_path, - const IOOptions& /*options*/, - std::string* 
output_path, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + void SetBackgroundThreads(int num, Priority pri) override { + return env_target_->SetBackgroundThreads(num, pri); } - - IOStatus RenameFile(const std::string& s, const std::string& t, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->RenameFile(s, t)); + int GetBackgroundThreads(Priority pri) override { + return env_target_->GetBackgroundThreads(pri); } - IOStatus LinkFile(const std::string& s, const std::string& t, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->LinkFile(s, t)); + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + return env_target_->SetAllowNonOwnerAccess(allow_non_owner_access); } - IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, - uint64_t* count, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->NumFileLinks(fname, count)); + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + return env_target_->IncBackgroundThreadsIfNeeded(num, pri); } - IOStatus AreFilesSame(const std::string& first, const std::string& second, - const IOOptions& /*options*/, bool* res, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->AreFilesSame(first, second, res)); + void LowerThreadPoolIOPriority(Priority pool) override { + env_target_->LowerThreadPoolIOPriority(pool); } - IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, - FileLock** l, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->LockFile(f, l)); + void LowerThreadPoolCPUPriority(Priority pool) override { + env_target_->LowerThreadPoolCPUPriority(pool); } - IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->UnlockFile(l)); + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return env_target_->LowerThreadPoolCPUPriority(pool, pri); } - IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetTestDirectory(path)); - } - IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, - std::shared_ptr* result, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->NewLogger(fname, result)); + Status GetThreadList(std::vector* thread_list) override { + return env_target_->GetThreadList(thread_list); } - void SanitizeFileOptions(FileOptions* opts) const override { - target_->SanitizeEnvOptions(opts); + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return env_target_->GetThreadStatusUpdater(); } - FileOptions OptimizeForLogRead( - const FileOptions& file_options) const override { - return target_->OptimizeForLogRead(file_options); - } - FileOptions OptimizeForManifestRead( - const FileOptions& file_options) const override { - return target_->OptimizeForManifestRead(file_options); - } - FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { - return target_->OptimizeForLogWrite(file_options, db_options); - } - FileOptions OptimizeForManifestWrite( - const FileOptions& file_options) const override { - return target_->OptimizeForManifestWrite(file_options); - } - FileOptions OptimizeForCompactionTableWrite( - const 
FileOptions& file_options, - const ImmutableDBOptions& immutable_ops) const override { - return target_->OptimizeForCompactionTableWrite(file_options, - immutable_ops); - } - FileOptions OptimizeForCompactionTableRead( - const FileOptions& file_options, - const ImmutableDBOptions& db_options) const override { - return target_->OptimizeForCompactionTableRead(file_options, db_options); - } + uint64_t GetThreadID() const override { return env_target_->GetThreadID(); } -// This seems to clash with a macro on Windows, so #undef it here -#ifdef GetFreeSpace -#undef GetFreeSpace -#endif - IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, - uint64_t* diskfree, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetFreeSpace(path, diskfree)); - } - IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/, - bool* is_dir, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->IsDirectory(path, is_dir)); + std::string GenerateUniqueId() override { + return env_target_->GenerateUniqueId(); } private: - Env* target_; + Env* env_target_; }; -inline std::unique_ptr NewLegacySequentialFileWrapper( - std::unique_ptr& file) { - return std::unique_ptr( - new LegacySequentialFileWrapper(std::move(file))); -} - -inline std::unique_ptr NewLegacyRandomAccessFileWrapper( - std::unique_ptr& file) { - return std::unique_ptr( - new LegacyRandomAccessFileWrapper(std::move(file))); -} - -inline std::unique_ptr NewLegacyWritableFileWrapper( - std::unique_ptr&& file) { - return std::unique_ptr( - new LegacyWritableFileWrapper(std::move(file))); -} - +std::unique_ptr NewLegacySequentialFileWrapper( + std::unique_ptr& file); } // namespace ROCKSDB_NAMESPACE diff --git a/env/env.cc b/env/env.cc index 829fcefb15d..000a52575ef 100644 --- a/env/env.cc +++ b/env/env.cc @@ -10,25 +10,569 @@ #include "rocksdb/env.h" #include + #include "env/composite_env_wrapper.h" #include "logging/env_logger.h" #include "memory/arena.h" #include "options/db_options.h" #include "port/port.h" -#include "port/sys_time.h" +#include "rocksdb/convenience.h" #include "rocksdb/options.h" +#include "rocksdb/system_clock.h" #include "rocksdb/utilities/object_registry.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +namespace { +class LegacySystemClock : public SystemClock { + private: + Env* env_; + + public: + explicit LegacySystemClock(Env* env) : env_(env) {} + const char* Name() const override { return "Legacy System Clock"; } + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + uint64_t NowMicros() override { return env_->NowMicros(); } + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + uint64_t NowNanos() override { return env_->NowNanos(); } + + uint64_t CPUMicros() override { return CPUNanos() / 1000; } + uint64_t CPUNanos() override { return env_->NowCPUNanos(); } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + void SleepForMicroseconds(int micros) override { + env_->SleepForMicroseconds(micros); + } + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). 
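// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: once every Env carries a
// SystemClock (installed for legacy Envs by LegacySystemClock above), timing
// code can be written against the clock rather than the Env. Only
// Env::GetSystemClock() and SystemClock::NowMicros(), both introduced in this
// change, are assumed; the helper name is hypothetical.
#include <cstdint>
#include <functional>
#include <memory>

#include "rocksdb/env.h"
#include "rocksdb/system_clock.h"

// Returns how long `fn` took in microseconds, as measured by the Env's clock.
uint64_t TimeCallMicros(ROCKSDB_NAMESPACE::Env* env,
                        const std::function<void()>& fn) {
  const std::shared_ptr<ROCKSDB_NAMESPACE::SystemClock>& clock =
      env->GetSystemClock();
  const uint64_t start = clock->NowMicros();
  fn();
  return clock->NowMicros() - start;
}
// ---------------------------------------------------------------------------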
+ // Only overwrites *unix_time on success. + Status GetCurrentTime(int64_t* unix_time) override { + return env_->GetCurrentTime(unix_time); + } + // Converts seconds-since-Jan-01-1970 to a printable string + std::string TimeToString(uint64_t time) override { + return env_->TimeToString(time); + } +}; + +class LegacySequentialFileWrapper : public FSSequentialFile { + public: + explicit LegacySequentialFileWrapper( + std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Read(n, result, scratch)); + } + IOStatus Skip(uint64_t n) override { + return status_to_io_status(target_->Skip(n)); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + IOStatus PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->PositionedRead(offset, n, result, scratch)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { + public: + explicit LegacyRandomAccessFileWrapper( + std::unique_ptr&& target) + : target_(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + + IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + std::vector reqs; + Status status; + + reqs.reserve(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest req; + + req.offset = fs_reqs[i].offset; + req.len = fs_reqs[i].len; + req.scratch = fs_reqs[i].scratch; + req.status = Status::OK(); + + reqs.emplace_back(req); + } + status = target_->MultiRead(reqs.data(), num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].result = reqs[i].result; + fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); + } + return status_to_io_status(std::move(status)); + } + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Prefetch(offset, n)); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((RandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomRWFileWrapper : public FSRandomRWFile { + public: + explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + 
return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Write(offset, data)); + } + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + + private: + std::unique_ptr target_; +}; + +class LegacyWritableFileWrapper : public FSWritableFile { + public: + explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(size)); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return target_->GetFileSize(); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* 
block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RangeSync(offset, nbytes)); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + target_->PrepareWrite(offset, len); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Allocate(offset, len)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyDirectoryWrapper : public FSDirectory { + public: + explicit LegacyDirectoryWrapper(std::unique_ptr&& target) + : target_(std::move(target)) {} + + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; + +class LegacyFileSystemWrapper : public FileSystem { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} + ~LegacyFileSystemWrapper() override {} + + const char* Name() const override { return "Legacy File System"; } + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewSequentialFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacySequentialFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomAccessFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewWritableFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->ReopenWritableFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + 
std::unique_ptr file; + Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomRWFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyRandomRWFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + return status_to_io_status( + target_->NewMemoryMappedFileBuffer(fname, result)); + } + IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr dir; + Status s = target_->NewDirectory(name, &dir); + if (s.ok()) { + result->reset(new LegacyDirectoryWrapper(std::move(dir))); + } + return status_to_io_status(std::move(s)); + } + IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->FileExists(f)); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, + std::vector* r, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildren(dir, r)); + } + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteFile(f)); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(fname, size)); + } + IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDir(d)); + } + IOStatus CreateDirIfMissing(const std::string& d, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDirIfMissing(d)); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteDir(d)); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFileSize(f, s)); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->GetFileModificationTime(fname, file_mtime)); + } + + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RenameFile(s, t)); + } + + IOStatus LinkFile(const std::string& s, const 
std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LinkFile(s, t)); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, + uint64_t* count, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NumFileLinks(fname, count)); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& /*options*/, bool* res, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->AreFilesSame(first, second, res)); + } + + IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, + FileLock** l, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LockFile(f, l)); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->UnlockFile(l)); + } + + IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetTestDirectory(path)); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NewLogger(fname, result)); + } + + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeEnvOptions(opts); + } + + FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, + uint64_t* diskfree, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFreeSpace(path, diskfree)); + } + IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->IsDirectory(path, is_dir)); + } + + private: + Env* target_; +}; +} // end anonymous namespace Env::Env() : thread_status_updater_(nullptr) { file_system_ = std::make_shared(this); + system_clock_ = std::make_shared(this); } -Env::Env(std::shared_ptr fs) - : thread_status_updater_(nullptr), - file_system_(fs) {} +Env::Env(const std::shared_ptr& fs) + : thread_status_updater_(nullptr), 
file_system_(fs) { + system_clock_ = std::make_shared(this); +} + +Env::Env(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {} Env::~Env() { } @@ -39,11 +583,18 @@ Status Env::NewLogger(const std::string& fname, } Status Env::LoadEnv(const std::string& value, Env** result) { + return CreateFromString(ConfigOptions(), value, result); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result) { Env* env = *result; Status s; #ifndef ROCKSDB_LITE + (void)config_options; s = ObjectRegistry::NewInstance()->NewStaticObject(value, &env); #else + (void)config_options; s = Status::NotSupported("Cannot load environment in LITE mode", value); #endif if (s.ok()) { @@ -54,18 +605,29 @@ Status Env::LoadEnv(const std::string& value, Env** result) { Status Env::LoadEnv(const std::string& value, Env** result, std::shared_ptr* guard) { + return CreateFromString(ConfigOptions(), value, result, guard); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard) { assert(result); + if (value.empty()) { + *result = Env::Default(); + return Status::OK(); + } Status s; #ifndef ROCKSDB_LITE Env* env = nullptr; std::unique_ptr uniq_guard; std::string err_msg; assert(guard != nullptr); + (void)config_options; env = ObjectRegistry::NewInstance()->NewObject(value, &uniq_guard, &err_msg); if (!env) { - s = Status::NotFound(std::string("Cannot load ") + Env::Type() + ": " + - value); + s = Status::NotSupported(std::string("Cannot load ") + Env::Type() + ": " + + value); env = Env::Default(); } if (s.ok() && uniq_guard) { @@ -75,6 +637,7 @@ Status Env::LoadEnv(const std::string& value, Env** result, *result = env; } #else + (void)config_options; (void)result; (void)guard; s = Status::NotSupported("Cannot load environment in LITE mode", value); @@ -82,6 +645,30 @@ Status Env::LoadEnv(const std::string& value, Env** result, return s; } +Status Env::CreateFromUri(const ConfigOptions& config_options, + const std::string& env_uri, const std::string& fs_uri, + Env** result, std::shared_ptr* guard) { + *result = config_options.env; + if (env_uri.empty() && fs_uri.empty()) { + // Neither specified. Use the default + guard->reset(); + return Status::OK(); + } else if (!env_uri.empty() && !fs_uri.empty()) { + // Both specified. Cannot choose. Return Invalid + return Status::InvalidArgument("cannot specify both fs_uri and env_uri"); + } else if (fs_uri.empty()) { // Only have an ENV URI. Create an Env from it + return CreateFromString(config_options, env_uri, result, guard); + } else { + std::shared_ptr fs; + Status s = FileSystem::CreateFromString(config_options, fs_uri, &fs); + if (s.ok()) { + guard->reset(new CompositeEnvWrapper(*result, fs)); + *result = guard->get(); + } + return s; + } +} + std::string Env::PriorityToString(Env::Priority priority) { switch (priority) { case Env::Priority::BOTTOM: @@ -140,6 +727,16 @@ Status Env::GetChildrenFileAttributes(const std::string& dir, return Status::OK(); } +Status Env::GetHostNameString(std::string* result) { + std::array hostname_buf; + Status s = GetHostName(hostname_buf.data(), hostname_buf.size()); + if (s.ok()) { + hostname_buf[hostname_buf.size() - 1] = '\0'; + result->assign(hostname_buf.data()); + } + return s; +} + SequentialFile::~SequentialFile() { } @@ -377,13 +974,13 @@ void Log(const std::shared_ptr& info_log, const char* format, ...) 
{ Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, bool should_sync) { - LegacyFileSystemWrapper lfsw(env); - return WriteStringToFile(&lfsw, data, fname, should_sync); + const auto& fs = env->GetFileSystem(); + return WriteStringToFile(fs.get(), data, fname, should_sync); } Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { - LegacyFileSystemWrapper lfsw(env); - return ReadFileToString(&lfsw, fname, data); + const auto& fs = env->GetFileSystem(); + return ReadFileToString(fs.get(), fname, data); } EnvWrapper::~EnvWrapper() { @@ -449,6 +1046,12 @@ EnvOptions Env::OptimizeForCompactionTableRead( optimized_env_options.use_direct_reads = db_options.use_direct_reads; return optimized_env_options; } +EnvOptions Env::OptimizeForBlobFileRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} EnvOptions::EnvOptions(const DBOptions& options) { AssignEnvOptions(this, options); @@ -461,18 +1064,18 @@ EnvOptions::EnvOptions() { Status NewEnvLogger(const std::string& fname, Env* env, std::shared_ptr* result) { - EnvOptions options; + FileOptions options; // TODO: Tune the buffer size. options.writable_file_max_buffer_size = 1024 * 1024; - std::unique_ptr writable_file; - const auto status = env->NewWritableFile(fname, &writable_file, options); + std::unique_ptr writable_file; + const auto status = env->GetFileSystem()->NewWritableFile( + fname, options, &writable_file, nullptr); if (!status.ok()) { return status; } - *result = std::make_shared( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, options, - env); + *result = std::make_shared(std::move(writable_file), fname, + options, env); return Status::OK(); } @@ -480,10 +1083,14 @@ const std::shared_ptr& Env::GetFileSystem() const { return file_system_; } -#ifdef OS_WIN -std::unique_ptr NewCompositeEnv(std::shared_ptr fs) { - return std::unique_ptr(new CompositeEnvWrapper(Env::Default(), fs)); +const std::shared_ptr& Env::GetSystemClock() const { + return system_clock_; +} + +std::unique_ptr NewLegacySequentialFileWrapper( + std::unique_ptr& file) { + return std::unique_ptr( + new LegacySequentialFileWrapper(std::move(file))); } -#endif } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index cc91e10eb99..e8e3df5f65a 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -10,6 +10,7 @@ #include #include "env/mock_env.h" +#include "file/file_util.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/env_encryption.h" @@ -17,46 +18,6 @@ namespace ROCKSDB_NAMESPACE { -// Normalizes trivial differences across Envs such that these test cases can -// run on all Envs. -class NormalizingEnvWrapper : public EnvWrapper { - private: - std::unique_ptr base_; - - public: - explicit NormalizingEnvWrapper(std::unique_ptr&& base) - : EnvWrapper(base.get()), base_(std::move(base)) {} - explicit NormalizingEnvWrapper(Env* base) : EnvWrapper(base) {} - - // Removes . and .. from directory listing - Status GetChildren(const std::string& dir, - std::vector* result) override { - Status status = EnvWrapper::GetChildren(dir, result); - if (status.ok()) { - result->erase(std::remove_if(result->begin(), result->end(), - [](const std::string& s) { - return s == "." || s == ".."; - }), - result->end()); - } - return status; - } - - // Removes . 
and .. from directory listing - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - Status status = EnvWrapper::GetChildrenFileAttributes(dir, result); - if (status.ok()) { - result->erase(std::remove_if(result->begin(), result->end(), - [](const FileAttributes& fa) { - return fa.name == "." || fa.name == ".."; - }), - result->end()); - } - return status; - } -}; - class EnvBasicTestWithParam : public testing::Test, public ::testing::WithParamInterface { public: @@ -68,32 +29,17 @@ class EnvBasicTestWithParam : public testing::Test, test_dir_ = test::PerThreadDBPath(env_, "env_basic_test"); } - void SetUp() override { - env_->CreateDirIfMissing(test_dir_).PermitUncheckedError(); - } + void SetUp() override { ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); } - void TearDown() override { - std::vector files; - env_->GetChildren(test_dir_, &files).PermitUncheckedError(); - for (const auto& file : files) { - // don't know whether it's file or directory, try both. The tests must - // only create files or empty directories, so one must succeed, else the - // directory's corrupted. - Status s = env_->DeleteFile(test_dir_ + "/" + file); - if (!s.ok()) { - ASSERT_OK(env_->DeleteDir(test_dir_ + "/" + file)); - } - } - } + void TearDown() override { ASSERT_OK(DestroyDir(env_, test_dir_)); } }; class EnvMoreTestWithParam : public EnvBasicTestWithParam {}; -static std::unique_ptr def_env(new NormalizingEnvWrapper(Env::Default())); INSTANTIATE_TEST_CASE_P(EnvDefault, EnvBasicTestWithParam, - ::testing::Values(def_env.get())); + ::testing::Values(Env::Default())); INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam, - ::testing::Values(def_env.get())); + ::testing::Values(Env::Default())); static std::unique_ptr mock_env(new MockEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, @@ -104,8 +50,7 @@ static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) { std::shared_ptr provider; EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id, &provider)); - std::unique_ptr encrypted(NewEncryptedEnv(base, provider)); - return new NormalizingEnvWrapper(std::move(encrypted)); + return NewEncryptedEnv(base, provider); } // next statements run env test against default encryption code. @@ -130,19 +75,30 @@ namespace { // The purpose of returning an empty vector (instead of nullptr) is that gtest // ValuesIn() will skip running tests when given an empty collection. 
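To show how the empty-vector convention mentioned above is consumed, here is a minimal sketch of the parameterized-test instantiations that would sit after the GetCustomEnvs() helper defined next; the "CustomEnv" instantiation name is an assumption for illustration, not taken from this patch.

// Sketch only: runs the basic and extended Env tests against any Env built
// from TEST_ENV_URI / TEST_FS_URI. With neither variable set, GetCustomEnvs()
// returns an empty vector and gtest's ValuesIn() simply skips these
// instantiations.
INSTANTIATE_TEST_CASE_P(CustomEnv, EnvBasicTestWithParam,
                        ::testing::ValuesIn(GetCustomEnvs()));
INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam,
                        ::testing::ValuesIn(GetCustomEnvs()));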
std::vector GetCustomEnvs() { - static Env* custom_env; static bool init = false; + static std::vector res; if (!init) { init = true; const char* uri = getenv("TEST_ENV_URI"); if (uri != nullptr) { - Env::LoadEnv(uri, &custom_env); + static std::shared_ptr env_guard; + static Env* custom_env; + Status s = + Env::CreateFromUri(ConfigOptions(), uri, "", &custom_env, &env_guard); + if (s.ok()) { + res.emplace_back(custom_env); + } + } + uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + static std::shared_ptr fs_env_guard; + static Env* fs_env; + Status s = + Env::CreateFromUri(ConfigOptions(), "", uri, &fs_env, &fs_env_guard); + if (s.ok()) { + res.emplace_back(fs_env); + } } - } - - std::vector res; - if (custom_env != nullptr) { - res.emplace_back(custom_env); } return res; } @@ -228,8 +184,8 @@ TEST_P(EnvBasicTestWithParam, Basics) { ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g")); ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(0U, children.size()); - ASSERT_TRUE( - env_->GetChildren(test_dir_ + "/non_existent", &children).IsNotFound()); + Status s = env_->GetChildren(test_dir_ + "/non_existent", &children); + ASSERT_TRUE(s.IsNotFound()); } TEST_P(EnvBasicTestWithParam, ReadWrite) { @@ -325,7 +281,7 @@ TEST_P(EnvMoreTestWithParam, MakeDir) { ASSERT_OK(env_->CreateDir(test_dir_ + "/j")); ASSERT_OK(env_->FileExists(test_dir_ + "/j")); std::vector children; - env_->GetChildren(test_dir_, &children); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(1U, children.size()); // fail because file already exists ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok()); @@ -374,6 +330,32 @@ TEST_P(EnvMoreTestWithParam, GetChildren) { ASSERT_EQ(0U, children.size()); } +TEST_P(EnvMoreTestWithParam, GetChildrenIgnoresDotAndDotDot) { + auto* env = Env::Default(); + ASSERT_OK(env->CreateDirIfMissing(test_dir_)); + + // Create a single file + std::string path = test_dir_; + const EnvOptions soptions; +#ifdef OS_WIN + path.append("\\test_file"); +#else + path.append("/test_file"); +#endif + std::string data("test data"); + std::unique_ptr file; + ASSERT_OK(env->NewWritableFile(path, &file, soptions)); + ASSERT_OK(file->Append("test data")); + + // get the children + std::vector result; + ASSERT_OK(env->GetChildren(test_dir_, &result)); + + // expect only one file named `test_data`, i.e. 
no `.` or `..` names + ASSERT_EQ(result.size(), 1); + ASSERT_EQ(result.at(0), "test_file"); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/env/env_chroot.cc b/env/env_chroot.cc index 4bc2f9a2507..ff47049e0f0 100644 --- a/env/env_chroot.cc +++ b/env/env_chroot.cc @@ -7,23 +7,21 @@ #include "env/env_chroot.h" -#include -#include -#include -#include +#include // errno +#include // realpath, free +#include // geteuid -#include -#include -#include - -#include "rocksdb/status.h" +#include "env/composite_env_wrapper.h" +#include "env/fs_remap.h" +#include "util/string_util.h" // errnoStr namespace ROCKSDB_NAMESPACE { - -class ChrootEnv : public EnvWrapper { +namespace { +class ChrootFileSystem : public RemapFileSystem { public: - ChrootEnv(Env* base_env, const std::string& chroot_dir) - : EnvWrapper(base_env) { + ChrootFileSystem(const std::shared_ptr& base, + const std::string& chroot_dir) + : RemapFileSystem(base) { #if defined(OS_AIX) char resolvedName[PATH_MAX]; char* real_chroot_dir = realpath(chroot_dir.c_str(), resolvedName); @@ -38,217 +36,10 @@ class ChrootEnv : public EnvWrapper { #endif } - Status RegisterDbPaths(const std::vector& paths) override { - std::vector encoded_paths; - encoded_paths.reserve(paths.size()); - for (auto& path : paths) { - auto status_and_enc_path = EncodePathWithNewBasename(path); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - encoded_paths.emplace_back(status_and_enc_path.second); - } - return EnvWrapper::Env::RegisterDbPaths(encoded_paths); - } - - Status UnregisterDbPaths(const std::vector& paths) override { - std::vector encoded_paths; - encoded_paths.reserve(paths.size()); - for (auto& path : paths) { - auto status_and_enc_path = EncodePathWithNewBasename(path); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - encoded_paths.emplace_back(status_and_enc_path.second); - } - return EnvWrapper::Env::UnregisterDbPaths(encoded_paths); - } - - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewSequentialFile(status_and_enc_path.second, result, - options); - } - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewRandomAccessFile(status_and_enc_path.second, result, - options); - } - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewWritableFile(status_and_enc_path.second, result, - options); - } - - Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - auto status_and_old_enc_path = EncodePath(old_fname); - if (!status_and_old_enc_path.first.ok()) { - return 
status_and_old_enc_path.first; - } - return EnvWrapper::ReuseWritableFile(status_and_old_enc_path.second, - status_and_old_enc_path.second, result, - options); - } - - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewRandomRWFile(status_and_enc_path.second, result, - options); - } - - Status NewDirectory(const std::string& dir, - std::unique_ptr* result) override { - auto status_and_enc_path = EncodePathWithNewBasename(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewDirectory(status_and_enc_path.second, result); - } - - Status FileExists(const std::string& fname) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::FileExists(status_and_enc_path.second); - } - - Status GetChildren(const std::string& dir, - std::vector* result) override { - auto status_and_enc_path = EncodePath(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetChildren(status_and_enc_path.second, result); - } - - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - auto status_and_enc_path = EncodePath(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetChildrenFileAttributes(status_and_enc_path.second, - result); - } - - Status DeleteFile(const std::string& fname) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::DeleteFile(status_and_enc_path.second); - } - - Status CreateDir(const std::string& dirname) override { - auto status_and_enc_path = EncodePathWithNewBasename(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::CreateDir(status_and_enc_path.second); - } - - Status CreateDirIfMissing(const std::string& dirname) override { - auto status_and_enc_path = EncodePathWithNewBasename(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::CreateDirIfMissing(status_and_enc_path.second); - } - - Status DeleteDir(const std::string& dirname) override { - auto status_and_enc_path = EncodePath(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::DeleteDir(status_and_enc_path.second); - } + const char* Name() const override { return "ChrootFS"; } - Status GetFileSize(const std::string& fname, uint64_t* file_size) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetFileSize(status_and_enc_path.second, file_size); - } - - Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetFileModificationTime(status_and_enc_path.second, - file_mtime); - } - - Status RenameFile(const std::string& src, const std::string& dest) override { - auto status_and_src_enc_path = EncodePath(src); - if 
(!status_and_src_enc_path.first.ok()) { - return status_and_src_enc_path.first; - } - auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); - if (!status_and_dest_enc_path.first.ok()) { - return status_and_dest_enc_path.first; - } - return EnvWrapper::RenameFile(status_and_src_enc_path.second, - status_and_dest_enc_path.second); - } - - Status LinkFile(const std::string& src, const std::string& dest) override { - auto status_and_src_enc_path = EncodePath(src); - if (!status_and_src_enc_path.first.ok()) { - return status_and_src_enc_path.first; - } - auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); - if (!status_and_dest_enc_path.first.ok()) { - return status_and_dest_enc_path.first; - } - return EnvWrapper::LinkFile(status_and_src_enc_path.second, - status_and_dest_enc_path.second); - } - - Status LockFile(const std::string& fname, FileLock** lock) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - // FileLock subclasses may store path (e.g., PosixFileLock stores it). We - // can skip stripping the chroot directory from this path because callers - // shouldn't use it. - return EnvWrapper::LockFile(status_and_enc_path.second, lock); - } - - Status GetTestDirectory(std::string* path) override { + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override { // Adapted from PosixEnv's implementation since it doesn't provide a way to // create directory in the chroot. char buf[256]; @@ -256,36 +47,19 @@ class ChrootEnv : public EnvWrapper { *path = buf; // Directory may already exist, so ignore return - return CreateDirIfMissing(*path); - } - - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewLogger(status_and_enc_path.second, result); - } - - Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override { - auto status_and_enc_path = EncodePath(db_path); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetAbsolutePath(status_and_enc_path.second, output_path); + return CreateDirIfMissing(*path, options, dbg); } - private: + protected: // Returns status and expanded absolute path including the chroot directory. // Checks whether the provided path breaks out of the chroot. If it returns // non-OK status, the returned path should not be used. 
- std::pair EncodePath(const std::string& path) { + std::pair EncodePath( + const std::string& path) override { if (path.empty() || path[0] != '/') { - return {Status::InvalidArgument(path, "Not an absolute path"), ""}; + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; } - std::pair res; + std::pair res; res.second = chroot_dir_ + path; #if defined(OS_AIX) char resolvedName[PATH_MAX]; @@ -294,14 +68,14 @@ class ChrootEnv : public EnvWrapper { char* normalized_path = realpath(res.second.c_str(), nullptr); #endif if (normalized_path == nullptr) { - res.first = Status::NotFound(res.second, strerror(errno)); + res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str()); } else if (strlen(normalized_path) < chroot_dir_.size() || strncmp(normalized_path, chroot_dir_.c_str(), chroot_dir_.size()) != 0) { - res.first = Status::IOError(res.second, - "Attempted to access path outside chroot"); + res.first = IOStatus::IOError(res.second, + "Attempted to access path outside chroot"); } else { - res.first = Status::OK(); + res.first = IOStatus::OK(); } #if !defined(OS_AIX) free(normalized_path); @@ -311,10 +85,10 @@ class ChrootEnv : public EnvWrapper { // Similar to EncodePath() except assumes the basename in the path hasn't been // created yet. - std::pair EncodePathWithNewBasename( - const std::string& path) { + std::pair EncodePathWithNewBasename( + const std::string& path) override { if (path.empty() || path[0] != '/') { - return {Status::InvalidArgument(path, "Not an absolute path"), ""}; + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; } // Basename may be followed by trailing slashes size_t final_idx = path.find_last_not_of('/'); @@ -331,14 +105,23 @@ class ChrootEnv : public EnvWrapper { return status_and_enc_path; } + private: std::string chroot_dir_; }; +} // namespace + +std::shared_ptr NewChrootFileSystem( + const std::shared_ptr& base, const std::string& chroot_dir) { + return std::make_shared(base, chroot_dir); +} Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) { if (!base_env->FileExists(chroot_dir).ok()) { return nullptr; } - return new ChrootEnv(base_env, chroot_dir); + std::shared_ptr chroot_fs = + NewChrootFileSystem(base_env->GetFileSystem(), chroot_dir); + return new CompositeEnvWrapper(base_env, chroot_fs); } } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_chroot.h b/env/env_chroot.h index cb5585b3b7d..fb5b70c4480 100644 --- a/env/env_chroot.h +++ b/env/env_chroot.h @@ -15,6 +15,9 @@ namespace ROCKSDB_NAMESPACE { // Returns an Env that translates paths such that the root directory appears to // be chroot_dir. chroot_dir should refer to an existing directory. +// +// This class has not been fully analyzed for providing strong security +// guarantees. Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir); } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_encryption.cc b/env/env_encryption.cc index ca2542abbb1..a5670ad780d 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -12,9 +12,12 @@ #include #include +#include "env/composite_env_wrapper.h" #include "env/env_encryption_ctr.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/convenience.h" +#include "rocksdb/io_status.h" +#include "rocksdb/system_clock.h" #include "util/aligned_buffer.h" #include "util/coding.h" #include "util/random.h" @@ -84,19 +87,24 @@ std::shared_ptr EncryptionProvider::NewCTRProvider( // If an error was encountered, returns a non-OK status. 
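As a usage sketch of the chroot layering introduced here: NewChrootEnv() now builds a ChrootFileSystem over the base Env's FileSystem and wraps it in a CompositeEnvWrapper, so either layer can be used directly. The directory path below is a hypothetical example, not something defined by this patch.

// Sketch only: "/tmp/rocksdb_chroot" must already exist, otherwise
// NewChrootEnv() returns nullptr.
Env* base = Env::Default();
std::unique_ptr<Env> chroot_env(NewChrootEnv(base, "/tmp/rocksdb_chroot"));
if (chroot_env != nullptr) {
  // "/db" is resolved against the chroot directory, i.e. it maps to
  // "/tmp/rocksdb_chroot/db" on the underlying file system.
  Status s = chroot_env->CreateDirIfMissing("/db");
}
// The FileSystem layer can also be used on its own:
std::shared_ptr<FileSystem> chroot_fs =
    NewChrootFileSystem(base->GetFileSystem(), "/tmp/rocksdb_chroot");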
// // REQUIRES: External synchronization -Status EncryptedSequentialFile::Read(size_t n, Slice* result, char* scratch) { +IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { assert(scratch); - Status status = file_->Read(n, result, scratch); - if (!status.ok()) { - return status; + IOStatus io_s = file_->Read(n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; } { PERF_TIMER_GUARD(decrypt_data_nanos); - status = stream_->Decrypt(offset_, (char*)result->data(), result->size()); + io_s = status_to_io_status( + stream_->Decrypt(offset_, (char*)result->data(), result->size())); } - offset_ += result->size(); // We've already ready data from disk, so update - // offset_ even if decryption fails. - return status; + if (io_s.ok()) { + offset_ += result->size(); // We've already ready data from disk, so update + // offset_ even if decryption fails. + } + return io_s; } // Skip "n" bytes from the file. This is guaranteed to be no @@ -106,7 +114,7 @@ Status EncryptedSequentialFile::Read(size_t n, Slice* result, char* scratch) { // file, and Skip will return OK. // // REQUIRES: External synchronization -Status EncryptedSequentialFile::Skip(uint64_t n) { +IOStatus EncryptedSequentialFile::Skip(uint64_t n) { auto status = file_->Skip(n); if (!status.ok()) { return status; @@ -130,26 +138,30 @@ size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. -Status EncryptedSequentialFile::InvalidateCache(size_t offset, size_t length) { +IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset, + size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned -Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, - Slice* result, char* scratch) { +IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { assert(scratch); offset += prefixLength_; // Skip prefix - auto status = file_->PositionedRead(offset, n, result, scratch); - if (!status.ok()) { - return status; + auto io_s = file_->PositionedRead(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; } offset_ = offset + result->size(); { PERF_TIMER_GUARD(decrypt_data_nanos); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } - return status; + return io_s; } // Read up to "n" bytes from the file starting at "offset". @@ -162,25 +174,30 @@ Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. 
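The status_to_io_status() conversion used throughout these hunks is the general bridge between legacy Status-returning code and the IOStatus-based FileSystem API. A minimal sketch of the pattern, with a hypothetical helper name and call site:

// Sketch only: adapt a legacy Status-returning Env call to an IOStatus-based
// caller, as done repeatedly in the hunks above. `legacy_env` is hypothetical.
IOStatus MakeDirViaLegacyEnv(Env* legacy_env, const std::string& dirname) {
  // status_to_io_status() moves the Status into an IOStatus, keeping the
  // original code and message.
  return status_to_io_status(legacy_env->CreateDir(dirname));
}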
-Status EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { assert(scratch); offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); - if (!status.ok()) { - return status; + auto io_s = file_->Read(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; } { PERF_TIMER_GUARD(decrypt_data_nanos); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } - return status; + return io_s; } // Readahead the file starting from offset by n bytes for caching. -Status EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n) { +IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { // return Status::OK(); - return file_->Prefetch(offset + prefixLength_, n); + return file_->Prefetch(offset + prefixLength_, n, options, dbg); } // Tries to get an unique ID for this file that will be the same each time @@ -221,20 +238,21 @@ size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. -Status EncryptedRandomAccessFile::InvalidateCache(size_t offset, - size_t length) { +IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset, + size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. -Status EncryptedWritableFile::Append(const Slice& data) { +IOStatus EncryptedWritableFile::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { AlignedBuffer buf; - Status status; Slice dataToAppend(data); if (data.size() > 0) { - auto offset = file_->GetFileSize(); // size including prefix + auto offset = file_->GetFileSize(options, dbg); // size including prefix // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); @@ -242,26 +260,25 @@ Status EncryptedWritableFile::Append(const Slice& data) { // so that the next two lines can be replaced with buf.Append(). 
memmove(buf.BufferStart(), data.data(), data.size()); buf.Size(data.size()); + IOStatus io_s; { PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->Append(dataToAppend); - if (!status.ok()) { - return status; - } - return status; + return file_->Append(dataToAppend, options, dbg); } -Status EncryptedWritableFile::PositionedAppend(const Slice& data, - uint64_t offset) { +IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data, + uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) { AlignedBuffer buf; - Status status; Slice dataToAppend(data); offset += prefixLength_; if (data.size() > 0) { @@ -270,20 +287,18 @@ Status EncryptedWritableFile::PositionedAppend(const Slice& data, buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); buf.Size(data.size()); + IOStatus io_s; { PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->PositionedAppend(dataToAppend, offset); - if (!status.ok()) { - return status; - } - return status; + return file_->PositionedAppend(dataToAppend, offset, options, dbg); } // Indicates the upper layers if the current WritableFile implementation @@ -301,48 +316,83 @@ size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { /* * Get the size of valid data in the file. */ -uint64_t EncryptedWritableFile::GetFileSize() { - return file_->GetFileSize() - prefixLength_; +uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + return file_->GetFileSize(options, dbg) - prefixLength_; } - // Truncate is necessary to trim the file to the correct size - // before closing. It is not always possible to keep track of the file - // size due to whole pages writes. The behavior is undefined if called - // with other writes to follow. -Status EncryptedWritableFile::Truncate(uint64_t size) { - return file_->Truncate(size + prefixLength_); +// Truncate is necessary to trim the file to the correct size +// before closing. It is not always possible to keep track of the file +// size due to whole pages writes. The behavior is undefined if called +// with other writes to follow. +IOStatus EncryptedWritableFile::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Truncate(size + prefixLength_, options, dbg); } - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. - // This call has no effect on dirty pages in the cache. -Status EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. 
+// This call has no effect on dirty pages in the cache. +IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } - // Sync a file range with disk. - // offset is the starting byte of the file range to be synchronized. - // nbytes specifies the length of the range to be synchronized. - // This asks the OS to initiate flushing the cached data to disk, - // without waiting for completion. - // Default implementation does nothing. -Status EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { - return file_->RangeSync(offset + prefixLength_, nbytes); +// Sync a file range with disk. +// offset is the starting byte of the file range to be synchronized. +// nbytes specifies the length of the range to be synchronized. +// This asks the OS to initiate flushing the cached data to disk, +// without waiting for completion. +// Default implementation does nothing. +IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) { + return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg); +} + +// PrepareWrite performs any necessary preparation for a write +// before the write actually occurs. This allows for pre-allocation +// of space on devices where it can result in less file +// fragmentation and/or less waste from over-zealous filesystem +// pre-allocation. +void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len, + const IOOptions& options, + IODebugContext* dbg) { + file_->PrepareWrite(offset + prefixLength_, len, options, dbg); +} + +void EncryptedWritableFile::SetPreallocationBlockSize(size_t size) { + // the size here doesn't need to include prefixLength_, as it's a + // configuration will be use for `PrepareWrite()`. + file_->SetPreallocationBlockSize(size); } - // PrepareWrite performs any necessary preparation for a write - // before the write actually occurs. This allows for pre-allocation - // of space on devices where it can result in less file - // fragmentation and/or less waste from over-zealous filesystem - // pre-allocation. -void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len) { - file_->PrepareWrite(offset + prefixLength_, len); +void EncryptedWritableFile::GetPreallocationStatus( + size_t* block_size, size_t* last_allocated_block) { + file_->GetPreallocationStatus(block_size, last_allocated_block); } - // Pre-allocates space for a file. -Status EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len) { - return file_->Allocate(offset + prefixLength_, len); +// Pre-allocates space for a file. +IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Allocate(offset + prefixLength_, len, options, dbg); +} + +IOStatus EncryptedWritableFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} + +IOStatus EncryptedWritableFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); } // A file abstraction for random reading and writing. @@ -361,9 +411,10 @@ size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. 
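All of the writable-file overrides above apply the same translation: caller-visible ("logical") offsets and sizes exclude the encryption prefix, while everything forwarded to the underlying file is shifted by prefixLength_. A small illustrative sketch of that invariant; the 4096-byte prefix length is an arbitrary example value, not something fixed by this patch.

// Illustration only: offset/size translation performed by the encrypted file
// wrappers, using an example 4096-byte provider prefix.
constexpr uint64_t kExamplePrefixLength = 4096;

uint64_t LogicalToPhysicalOffset(uint64_t logical_offset) {
  return logical_offset + kExamplePrefixLength;  // e.g. Append/RangeSync/Allocate
}

uint64_t PhysicalToLogicalSize(uint64_t physical_size) {
  return physical_size - kExamplePrefixLength;   // e.g. GetFileSize
}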
-Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { +IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { AlignedBuffer buf; - Status status; Slice dataToWrite(data); offset += prefixLength_; if (data.size() > 0) { @@ -372,71 +423,89 @@ Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); buf.Size(data.size()); + IOStatus io_s; { PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->Write(offset, dataToWrite); - return status; + return file_->Write(offset, dataToWrite, options, dbg); } // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. -Status EncryptedRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { assert(scratch); offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); + auto status = file_->Read(offset, n, options, result, scratch, dbg); if (!status.ok()) { return status; } { PERF_TIMER_GUARD(decrypt_data_nanos); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); + status = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } return status; } -Status EncryptedRandomRWFile::Flush() { return file_->Flush(); } +IOStatus EncryptedRandomRWFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} -Status EncryptedRandomRWFile::Sync() { return file_->Sync(); } +IOStatus EncryptedRandomRWFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} -Status EncryptedRandomRWFile::Fsync() { return file_->Fsync(); } +IOStatus EncryptedRandomRWFile::Fsync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Fsync(options, dbg); +} -Status EncryptedRandomRWFile::Close() { return file_->Close(); } +IOStatus EncryptedRandomRWFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} -// EncryptedEnv implements an Env wrapper that adds encryption to files stored -// on disk. -class EncryptedEnvImpl : public EnvWrapper { +namespace { +// EncryptedFileSystemImpl implements an FileSystemWrapper that adds encryption +// to files stored on disk. +class EncryptedFileSystemImpl : public EncryptedFileSystem { + public: + const char* Name() const override { return "EncryptedFS"; } // Returns the raw encryption provider that should be used to write the input // encrypted file. If there is no such provider, NotFound is returned. 
- Status GetWritableProvider(const std::string& /*fname*/, - EncryptionProvider** result) { + IOStatus GetWritableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { if (provider_) { *result = provider_.get(); - return Status::OK(); + return IOStatus::OK(); } else { *result = nullptr; - return Status::NotFound("No WriteProvider specified"); + return IOStatus::NotFound("No WriteProvider specified"); } } // Returns the raw encryption provider that should be used to read the input // encrypted file. If there is no such provider, NotFound is returned. - Status GetReadableProvider(const std::string& /*fname*/, - EncryptionProvider** result) { + IOStatus GetReadableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { if (provider_) { *result = provider_.get(); - return Status::OK(); + return IOStatus::OK(); } else { *result = nullptr; - return Status::NotFound("No Provider specified"); + return IOStatus::NotFound("No Provider specified"); } } @@ -452,13 +521,13 @@ class EncryptedEnvImpl : public EnvWrapper { // should be encrypted // @return OK on success, non-OK on failure. template - Status CreateWritableCipherStream( + IOStatus CreateWritableCipherStream( const std::string& fname, const std::unique_ptr& underlying, - const EnvOptions& options, size_t* prefix_length, - std::unique_ptr* stream) { + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { EncryptionProvider* provider = nullptr; *prefix_length = 0; - Status status = GetWritableProvider(fname, &provider); + IOStatus status = GetWritableProvider(fname, &provider); if (!status.ok()) { return status; } else if (provider != nullptr) { @@ -470,34 +539,36 @@ class EncryptedEnvImpl : public EnvWrapper { // Initialize prefix buffer.Alignment(underlying->GetRequiredBufferAlignment()); buffer.AllocateNewBuffer(*prefix_length); - status = provider->CreateNewPrefix(fname, buffer.BufferStart(), - *prefix_length); + status = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); if (status.ok()) { buffer.Size(*prefix_length); prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); // Write prefix - status = underlying->Append(prefix); + status = underlying->Append(prefix, options.io_options, dbg); } if (!status.ok()) { return status; } } // Create cipher stream - status = provider->CreateCipherStream(fname, options, prefix, stream); + status = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); } return status; } template - Status CreateWritableEncryptedFile(const std::string& fname, - std::unique_ptr& underlying, - const EnvOptions& options, - std::unique_ptr* result) { + IOStatus CreateWritableEncryptedFile(const std::string& fname, + std::unique_ptr& underlying, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { // Create cipher stream std::unique_ptr stream; size_t prefix_length; - Status status = CreateWritableCipherStream(fname, underlying, options, - &prefix_length, &stream); + IOStatus status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); if (status.ok()) { if (stream) { result->reset(new EncryptedWritableFile( @@ -521,15 +592,15 @@ class EncryptedEnvImpl : public EnvWrapper { // should be encrypted // @return OK on success, non-OK on failure. 
template - Status CreateRandomWriteCipherStream( + IOStatus CreateRandomWriteCipherStream( const std::string& fname, const std::unique_ptr& underlying, - const EnvOptions& options, size_t* prefix_length, - std::unique_ptr* stream) { + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { EncryptionProvider* provider = nullptr; *prefix_length = 0; - Status status = GetWritableProvider(fname, &provider); - if (!status.ok()) { - return status; + IOStatus io_s = GetWritableProvider(fname, &provider); + if (!io_s.ok()) { + return io_s; } else if (provider != nullptr) { // Initialize & write prefix (if needed) AlignedBuffer buffer; @@ -539,22 +610,23 @@ class EncryptedEnvImpl : public EnvWrapper { // Initialize prefix buffer.Alignment(underlying->GetRequiredBufferAlignment()); buffer.AllocateNewBuffer(*prefix_length); - status = provider->CreateNewPrefix(fname, buffer.BufferStart(), - *prefix_length); - if (status.ok()) { + io_s = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (io_s.ok()) { buffer.Size(*prefix_length); prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); // Write prefix - status = underlying->Write(0, prefix); + io_s = underlying->Write(0, prefix, options.io_options, dbg); } - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } } // Create cipher stream - status = provider->CreateCipherStream(fname, options, prefix, stream); + io_s = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); } - return status; + return io_s; } // Creates a CipherStream for the underlying file/name using the options @@ -569,10 +641,10 @@ class EncryptedEnvImpl : public EnvWrapper { // is encrypted // @return OK on success, non-OK on failure. template - Status CreateSequentialCipherStream( + IOStatus CreateSequentialCipherStream( const std::string& fname, const std::unique_ptr& underlying, - const EnvOptions& options, size_t* prefix_length, - std::unique_ptr* stream) { + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { // Read prefix (if needed) AlignedBuffer buffer; Slice prefix; @@ -581,14 +653,15 @@ class EncryptedEnvImpl : public EnvWrapper { // Read prefix buffer.Alignment(underlying->GetRequiredBufferAlignment()); buffer.AllocateNewBuffer(*prefix_length); - Status status = - underlying->Read(*prefix_length, &prefix, buffer.BufferStart()); + IOStatus status = underlying->Read(*prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); if (!status.ok()) { return status; } buffer.Size(*prefix_length); } - return provider_->CreateCipherStream(fname, options, prefix, stream); + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); } // Creates a CipherStream for the underlying file/name using the options @@ -603,10 +676,10 @@ class EncryptedEnvImpl : public EnvWrapper { // is encrypted // @return OK on success, non-OK on failure. 
template - Status CreateRandomReadCipherStream( + IOStatus CreateRandomReadCipherStream( const std::string& fname, const std::unique_ptr& underlying, - const EnvOptions& options, size_t* prefix_length, - std::unique_ptr* stream) { + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { // Read prefix (if needed) AlignedBuffer buffer; Slice prefix; @@ -615,42 +688,60 @@ class EncryptedEnvImpl : public EnvWrapper { // Read prefix buffer.Alignment(underlying->GetRequiredBufferAlignment()); buffer.AllocateNewBuffer(*prefix_length); - Status status = - underlying->Read(0, *prefix_length, &prefix, buffer.BufferStart()); + IOStatus status = underlying->Read(0, *prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); if (!status.ok()) { return status; } buffer.Size(*prefix_length); } - return provider_->CreateCipherStream(fname, options, prefix, stream); + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); } public: - EncryptedEnvImpl(Env* base_env, - const std::shared_ptr& provider) - : EnvWrapper(base_env) { + EncryptedFileSystemImpl(const std::shared_ptr& base, + const std::shared_ptr& provider) + : EncryptedFileSystem(base) { provider_ = provider; } + Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) override { + return provider_->AddCipher(descriptor, cipher, len, for_write); + } + // NewSequentialFile opens a file for sequential reading. - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewSequentialFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewSequentialFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } + uint64_t file_size; + status = FileSystemWrapper::GetFileSize(fname, options.io_options, + &file_size, dbg); + if (!status.ok()) { + return status; + } + if (!file_size) { + *result = std::move(underlying); + return status; + } // Create cipher stream std::unique_ptr stream; size_t prefix_length; status = CreateSequentialCipherStream(fname, underlying, options, - &prefix_length, &stream); + &prefix_length, &stream, dbg); if (status.ok()) { result->reset(new EncryptedSequentialFile( std::move(underlying), std::move(stream), prefix_length)); @@ -659,23 +750,25 @@ class EncryptedEnvImpl : public EnvWrapper { } // NewRandomAccessFile opens a file for random read access. 
- virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewRandomAccessFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = FileSystemWrapper::NewRandomAccessFile(fname, options, + &underlying, dbg); if (!status.ok()) { return status; } std::unique_ptr stream; size_t prefix_length; status = CreateRandomReadCipherStream(fname, underlying, options, - &prefix_length, &stream); + &prefix_length, &stream, dbg); if (status.ok()) { if (stream) { result->reset(new EncryptedRandomAccessFile( @@ -688,20 +781,21 @@ class EncryptedEnvImpl : public EnvWrapper { } // NewWritableFile opens a file for sequential writing. - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::NewWritableFile(fname, &underlying, options); + std::unique_ptr underlying; + IOStatus status = + FileSystemWrapper::NewWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - return CreateWritableEncryptedFile(fname, underlying, options, result); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Create an object that writes to a new file with the specified @@ -711,39 +805,42 @@ class EncryptedEnvImpl : public EnvWrapper { // returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::ReopenWritableFile(fname, &underlying, options); + std::unique_ptr underlying; + IOStatus status = + FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - return CreateWritableEncryptedFile(fname, underlying, options, result); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Reuse an existing file by renaming it and opening it as writable. 
- virtual Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = - EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); + std::unique_ptr underlying; + auto status = FileSystemWrapper::ReuseWritableFile( + fname, old_fname, options, &underlying, dbg); if (!status.ok()) { return status; } - return CreateWritableEncryptedFile(fname, underlying, options, result); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Open `fname` for random read and write, if file doesn't exist the file @@ -751,19 +848,20 @@ class EncryptedEnvImpl : public EnvWrapper { // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads || options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Check file exists - bool isNewFile = !FileExists(fname).ok(); + bool isNewFile = !FileExists(fname, options.io_options, dbg).ok(); // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::NewRandomRWFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewRandomRWFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } @@ -773,10 +871,10 @@ class EncryptedEnvImpl : public EnvWrapper { if (!isNewFile) { // File already exists, read prefix status = CreateRandomReadCipherStream(fname, underlying, options, - &prefix_length, &stream); + &prefix_length, &stream, dbg); } else { status = CreateRandomWriteCipherStream(fname, underlying, options, - &prefix_length, &stream); + &prefix_length, &stream, dbg); } if (status.ok()) { if (stream) { @@ -803,9 +901,12 @@ class EncryptedEnvImpl : public EnvWrapper { // have // permission to access "dir", or if "dir" is invalid. // IOError if an IO Error was encountered - virtual Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, dbg); if (!status.ok()) { return status; } @@ -823,14 +924,15 @@ class EncryptedEnvImpl : public EnvWrapper { it->size_bytes -= provider->GetPrefixLength(); } } - return Status::OK(); + return IOStatus::OK(); } // Store the size of fname in *file_size. 
- virtual Status GetFileSize(const std::string& fname, - uint64_t* file_size) override { - auto status = EnvWrapper::GetFileSize(fname, file_size); - if (!status.ok()) { + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetFileSize(fname, options, file_size, dbg); + if (!status.ok() || !(*file_size)) { return status; } EncryptionProvider* provider; @@ -846,12 +948,19 @@ class EncryptedEnvImpl : public EnvWrapper { private: std::shared_ptr provider_; }; +} // namespace +std::shared_ptr NewEncryptedFS( + const std::shared_ptr& base, + const std::shared_ptr& provider) { + return std::make_shared(base, provider); +} // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. Env* NewEncryptedEnv(Env* base_env, const std::shared_ptr& provider) { - return new EncryptedEnvImpl(base_env, provider); + return new CompositeEnvWrapper( + base_env, NewEncryptedFS(base_env->GetFileSystem(), provider)); } // Encrypt one or more (partial) blocks of data at the file offset. @@ -1054,7 +1163,7 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, return Status::InvalidArgument("Encryption Cipher is missing"); } // Create & seed rnd. - Random rnd((uint32_t)Env::Default()->NowMicros()); + Random rnd((uint32_t)SystemClock::Default()->NowMicros()); // Fill entire prefix block with random values. for (size_t i = 0; i < prefixLength; i++) { prefix[i] = rnd.Uniform(256) & 0xFF; diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 3323eeb8af3..e0443dd944d 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -37,10 +37,10 @@ namespace { // Log error message static Status IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) - ? Status::NoSpace(context, strerror(err_number)) + ? Status::NoSpace(context, errnoStr(err_number).c_str()) : (err_number == ENOENT) - ? Status::PathNotFound(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + ? Status::PathNotFound(context, errnoStr(err_number).c_str()) + : Status::IOError(context, errnoStr(err_number).c_str()); } // assume that there is one global logger for now. It is not thread-safe, @@ -213,6 +213,8 @@ class HdfsWritableFile: public WritableFile { } } + using WritableFile::Append; + // If the file was successfully created, then this returns true. // Otherwise returns false. 
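A usage sketch for the new entry points: the provider id string below is illustrative, and a real provider must be configured with a cipher before encrypted files can be created.

// Sketch only: build an encryption provider, then either an encrypted Env or
// an encrypted FileSystem.
std::shared_ptr<EncryptionProvider> provider;
Status s =
    EncryptionProvider::CreateFromString(ConfigOptions(), "CTR", &provider);
if (s.ok()) {
  // Env-level wrapper, for legacy call sites:
  std::unique_ptr<Env> encrypted_env(
      NewEncryptedEnv(Env::Default(), provider));
  // FileSystem-level wrapper, for the new IOStatus-based call sites:
  std::shared_ptr<FileSystem> encrypted_fs =
      NewEncryptedFS(FileSystem::Default(), provider);
}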
bool isValid() { diff --git a/env/env_posix.cc b/env/env_posix.cc index fd7cbec22d3..fdcb6f6a31a 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -16,9 +16,6 @@ #include #include -#if defined(OS_LINUX) -#include -#endif #if defined(ROCKSDB_IOURING_PRESENT) #include #endif @@ -27,13 +24,10 @@ #include #include #include -#include #include #include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include -#include -#include #endif #include #include @@ -58,13 +52,14 @@ #include "env/composite_env_wrapper.h" #include "env/io_posix.h" -#include "logging/logging.h" #include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" +#include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" @@ -129,14 +124,86 @@ class PosixDynamicLibrary : public DynamicLibrary { }; #endif // !ROCKSDB_NO_DYNAMIC_EXTENSION -class PosixEnv : public CompositeEnvWrapper { +class PosixClock : public SystemClock { public: - // This constructor is for constructing non-default Envs, mainly by - // NewCompositeEnv(). It allows new instances to share the same - // threadpool and other resources as the default Env, while allowing - // a non-default FileSystem implementation - PosixEnv(const PosixEnv* default_env, std::shared_ptr fs); + const char* Name() const override { return "PosixClock"; } + uint64_t NowMicros() override { + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + uint64_t NowNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif defined(OS_SOLARIS) + return gethrtime(); +#elif defined(__MACH__) + clock_serv_t cclock; + mach_timespec_t ts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &ts); + mach_port_deallocate(mach_task_self(), cclock); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#else + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif + } + uint64_t CPUMicros() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000; +#endif + return 0; + } + + uint64_t CPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + + void SleepForMicroseconds(int micros) override { usleep(micros); } + + Status GetCurrentTime(int64_t* unix_time) override { + time_t ret = time(nullptr); + if (ret == (time_t)-1) { + return IOError("GetCurrentTime", "", errno); + } + *unix_time = (int64_t)ret; + return Status::OK(); + } + + std::string TimeToString(uint64_t secondsSince1970) override { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + 
dummy.resize(maxsize); + char* p = &dummy[0]; + localtime_r(&seconds, &t); + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + return dummy; + } +}; + +class PosixEnv : public CompositeEnv { + public: + PosixEnv(const PosixEnv* default_env, const std::shared_ptr& fs); ~PosixEnv() override { if (this == Env::Default()) { for (const auto tid : threads_to_join_) { @@ -244,50 +311,11 @@ class PosixEnv : public CompositeEnvWrapper { uint64_t GetThreadID() const override { return gettid(pthread_self()); } - uint64_t NowMicros() override { - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; - } - - uint64_t NowNanos() override { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ - defined(OS_AIX) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif defined(OS_SOLARIS) - return gethrtime(); -#elif defined(__MACH__) - clock_serv_t cclock; - mach_timespec_t ts; - host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); - clock_get_time(cclock, &ts); - mach_port_deallocate(mach_task_self(), cclock); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#else - return std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count(); -#endif - } - - uint64_t NowCPUNanos() override { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ - defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) - struct timespec ts; - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#endif - return 0; - } - - void SleepForMicroseconds(int micros) override { usleep(micros); } - Status GetHostName(char* name, uint64_t len) override { int ret = gethostname(name, static_cast(len)); if (ret < 0) { if (errno == EFAULT || errno == EINVAL) { - return Status::InvalidArgument(strerror(errno)); + return Status::InvalidArgument(errnoStr(errno).c_str()); } else { return IOError("GetHostName", name, errno); } @@ -295,15 +323,6 @@ class PosixEnv : public CompositeEnvWrapper { return Status::OK(); } - Status GetCurrentTime(int64_t* unix_time) override { - time_t ret = time(nullptr); - if (ret == (time_t) -1) { - return IOError("GetCurrentTime", "", errno); - } - *unix_time = (int64_t) ret; - return Status::OK(); - } - ThreadStatusUpdater* GetThreadStatusUpdater() const override { return Env::GetThreadStatusUpdater(); } @@ -352,26 +371,6 @@ class PosixEnv : public CompositeEnvWrapper { return Status::OK(); } - std::string TimeToString(uint64_t secondsSince1970) override { - const time_t seconds = (time_t)secondsSince1970; - struct tm t; - int maxsize = 64; - std::string dummy; - dummy.reserve(maxsize); - dummy.resize(maxsize); - char* p = &dummy[0]; - localtime_r(&seconds, &t); - snprintf(p, maxsize, - "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec); - return dummy; - } - private: friend Env* Env::Default(); // Constructs the default Env, a singleton @@ -394,7 +393,7 @@ class PosixEnv : public CompositeEnvWrapper { }; PosixEnv::PosixEnv() - : CompositeEnvWrapper(this, FileSystem::Default()), + : CompositeEnv(FileSystem::Default(), SystemClock::Default()), thread_pools_storage_(Priority::TOTAL), allow_non_owner_access_storage_(true), thread_pools_(thread_pools_storage_), @@ -411,12 +410,13 @@ 
PosixEnv::PosixEnv() thread_status_updater_ = CreateThreadStatusUpdater(); } -PosixEnv::PosixEnv(const PosixEnv* default_env, std::shared_ptr fs) - : CompositeEnvWrapper(this, fs), - thread_pools_(default_env->thread_pools_), - mu_(default_env->mu_), - threads_to_join_(default_env->threads_to_join_), - allow_non_owner_access_(default_env->allow_non_owner_access_) { +PosixEnv::PosixEnv(const PosixEnv* default_env, + const std::shared_ptr& fs) + : CompositeEnv(fs, default_env->GetSystemClock()), + thread_pools_(default_env->thread_pools_), + mu_(default_env->mu_), + threads_to_join_(default_env->threads_to_join_), + allow_non_owner_access_(default_env->allow_non_owner_access_) { thread_status_updater_ = default_env->thread_status_updater_; } @@ -470,11 +470,12 @@ void PosixEnv::WaitForJoin() { std::string Env::GenerateUniqueId() { std::string uuid_file = "/proc/sys/kernel/random/uuid"; + std::shared_ptr fs = FileSystem::Default(); - Status s = FileExists(uuid_file); + Status s = fs->FileExists(uuid_file, IOOptions(), nullptr); if (s.ok()) { std::string uuid; - s = ReadFileToString(this, uuid_file, &uuid); + s = ReadFileToString(fs.get(), uuid_file, &uuid); if (s.ok()) { return uuid; } @@ -514,11 +515,19 @@ Env* Env::Default() { return &default_env; } -std::unique_ptr NewCompositeEnv(std::shared_ptr fs) { +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs) { PosixEnv* default_env = static_cast(Env::Default()); return std::unique_ptr(new PosixEnv(default_env, fs)); } +// +// Default Posix SystemClock +// +const std::shared_ptr& SystemClock::Default() { + static std::shared_ptr default_clock = + std::make_shared(); + return default_clock; +} } // namespace ROCKSDB_NAMESPACE #endif diff --git a/env/env_test.cc b/env/env_test.cc index 660f210e46c..c7239047317 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -11,6 +11,11 @@ #include #endif +#if defined(ROCKSDB_IOURING_PRESENT) +#include +#include +#endif + #include #include @@ -35,10 +40,12 @@ #include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/crc32c.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/string_util.h" @@ -89,6 +96,11 @@ class EnvPosixTest : public testing::Test { Env* env_; bool direct_io_; EnvPosixTest() : env_(Env::Default()), direct_io_(false) {} + ~EnvPosixTest() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } }; class EnvPosixTestWithParam @@ -913,7 +925,7 @@ class IoctlFriendlyTmpdir { } else { // mkdtemp failed: diagnose it, but don't give up. fprintf(stderr, "mkdtemp(%s/...) failed: %s\n", d.c_str(), - strerror(errno)); + errnoStr(errno).c_str()); } } @@ -1038,7 +1050,8 @@ TEST_P(EnvPosixTestWithParam, AllocateTest) { int err_number = 0; if (alloc_status != 0) { err_number = errno; - fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number)); + fprintf(stderr, "Warning: fallocate() fails, %s\n", + errnoStr(err_number).c_str()); } close(fd); ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); @@ -1265,7 +1278,7 @@ TEST_P(EnvPosixTestWithParam, MultiRead) { } TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { - // In this test we don't do aligned read, wo it doesn't work for + // In this test we don't do aligned read, so it doesn't work for // direct I/O case. 
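Since timing, sleeping, and time formatting now live on SystemClock rather than Env, the following small hypothetical helper (not part of the patch) shows the replacement call sites; every method used here is one that PosixClock implements above.

```cpp
// Hypothetical helper (not in the patch): clock queries go through
// SystemClock::Default() instead of Env::Default().
#include <cinttypes>
#include <cstdio>
#include <memory>

#include "rocksdb/system_clock.h"

using namespace ROCKSDB_NAMESPACE;

void TimeSomething() {
  const std::shared_ptr<SystemClock>& clock = SystemClock::Default();
  uint64_t start = clock->NowNanos();
  clock->SleepForMicroseconds(1000);  // stands in for the work being measured
  uint64_t elapsed = clock->NowNanos() - start;

  int64_t unix_time = 0;
  clock->GetCurrentTime(&unix_time).PermitUncheckedError();
  std::printf("%s elapsed: %" PRIu64 " ns\n",
              clock->TimeToString(static_cast<uint64_t>(unix_time)).c_str(),
              elapsed);
}
```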
EnvOptions soptions; soptions.use_direct_reads = soptions.use_direct_writes = false; @@ -1356,6 +1369,121 @@ TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { } } +#if defined(ROCKSDB_IOURING_PRESENT) +void GenerateFilesAndRequest(Env* env, const std::string& fname, + std::vector* ret_reqs, + std::vector* scratches) { + const size_t kTotalSize = 81920; + Random rnd(301); + std::string expected_data = rnd.RandomString(kTotalSize); + + // Create file. + { + std::unique_ptr wfile; + ASSERT_OK(env->NewWritableFile(fname, &wfile, EnvOptions())); + ASSERT_OK(wfile->Append(expected_data)); + ASSERT_OK(wfile->Close()); + } + + // Right now kIoUringDepth is hard coded as 256, so we need very large + // number of keys to cover the case of multiple rounds of submissions. + // Right now the test latency is still acceptable. If it ends up with + // too long, we can modify the io uring depth with SyncPoint here. + const int num_reads = 3; + std::vector offsets = {10000, 20000, 30000}; + std::vector lens = {3000, 200, 100}; + + // Create requests + scratches->reserve(num_reads); + std::vector& reqs = *ret_reqs; + reqs.resize(num_reads); + for (int i = 0; i < num_reads; ++i) { + reqs[i].offset = offsets[i]; + reqs[i].len = lens[i]; + scratches->emplace_back(reqs[i].len, ' '); + reqs[i].scratch = const_cast(scratches->back().data()); + } +} + +TEST_F(EnvPosixTest, MultiReadIOUringError) { + // In this test we don't do aligned read, so we can't do direct I/O. + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector scratches; + std::vector reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_wait_cqe_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", + [&](void* arg) { + if (!io_uring_wait_cqe_called) { + io_uring_wait_cqe_called = true; + ssize_t& ret = *(static_cast(arg)); + ret = 1; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_wait_cqe_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(EnvPosixTest, MultiReadIOUringError2) { + // In this test we don't do aligned read, so we can't do direct I/O. 
+ EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector scratches; + std::vector reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_submit_and_wait_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + [&](void* arg) { + io_uring_submit_and_wait_called = true; + ssize_t* ret = static_cast(arg); + (*ret)--; + }); + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + [&](void* arg) { + struct io_uring* iu = static_cast(arg); + struct io_uring_cqe* cqe; + assert(io_uring_wait_cqe(iu, &cqe) == 0); + io_uring_cqe_seen(iu, cqe); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_submit_and_wait_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // ROCKSDB_IOURING_PRESENT + // Only works in linux platforms #ifdef OS_WIN TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) { @@ -1667,12 +1795,26 @@ TEST_P(EnvPosixTestWithParam, WritableFileWrapper) { return Status::OK(); } + Status Append( + const Slice& /*data*/, + const DataVerificationInfo& /* verification_info */) override { + inc(1); + return Status::OK(); + } + Status PositionedAppend(const Slice& /*data*/, uint64_t /*offset*/) override { inc(2); return Status::OK(); } + Status PositionedAppend( + const Slice& /*data*/, uint64_t /*offset*/, + const DataVerificationInfo& /* verification_info */) override { + inc(2); + return Status::OK(); + } + Status Truncate(uint64_t /*size*/) override { inc(3); return Status::OK(); @@ -2051,6 +2193,26 @@ TEST_F(EnvTest, Close) { delete env; } +class LogvWithInfoLogLevelLogger : public Logger { + public: + using Logger::Logv; + void Logv(const InfoLogLevel /* log_level */, const char* /* format */, + va_list /* ap */) override {} +}; + +TEST_F(EnvTest, LogvWithInfoLogLevel) { + // Verifies the log functions work on a `Logger` that only overrides the + // `Logv()` overload including `InfoLogLevel`. 
+ const std::string kSampleMessage("sample log message"); + LogvWithInfoLogLevelLogger logger; + ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); +} + INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(Env::Default(), false))); @@ -2146,7 +2308,7 @@ TEST_P(EnvFSTestWithParam, OptionsTest) { ASSERT_OK(db->Close()); delete db; - DestroyDB(dbname, opts); + ASSERT_OK(DestroyDB(dbname, opts)); dbname = dbname2_; } @@ -2193,7 +2355,8 @@ TEST_F(EnvTest, IsDirectory) { ASSERT_OK(s); std::unique_ptr fwriter; fwriter.reset(new WritableFileWriter(std::move(wfile), test_file_path, - FileOptions(), Env::Default())); + FileOptions(), + SystemClock::Default().get())); constexpr char buf[] = "test"; s = fwriter->Append(buf); ASSERT_OK(s); @@ -2202,6 +2365,28 @@ TEST_F(EnvTest, IsDirectory) { ASSERT_FALSE(is_dir); } +TEST_F(EnvTest, EnvWriteVerificationTest) { + Status s = Env::Default()->CreateDirIfMissing(test_directory_); + const std::string test_file_path = test_directory_ + "file1"; + ASSERT_OK(s); + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + std::unique_ptr file; + s = fault_fs_env->NewWritableFile(test_file_path, &file, EnvOptions()); + ASSERT_OK(s); + + DataVerificationInfo v_info; + std::string test_data = "test"; + std::string checksum; + uint32_t v_crc32c = crc32c::Extend(0, test_data.c_str(), test_data.size()); + PutFixed32(&checksum, v_crc32c); + v_info.checksum = Slice(checksum); + s = file->Append(Slice(test_data), v_info); + ASSERT_OK(s); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/env/file_system.cc b/env/file_system.cc index d2fc06c6c95..a6a2f3388a6 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -3,9 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
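The LogvWithInfoLogLevel test above verifies that a Logger may override only the InfoLogLevel-aware Logv() overload. A minimal hypothetical Logger of that shape (not part of the patch), writing to stderr, might look like this:

```cpp
// Hypothetical Logger overriding only the InfoLogLevel-aware Logv() overload,
// the case the new test exercises. Not part of the patch.
#include <cstdarg>
#include <cstdio>

#include "rocksdb/env.h"

using namespace ROCKSDB_NAMESPACE;

class StderrLogger : public Logger {
 public:
  using Logger::Logv;  // keep the (format, ap) overload visible
  void Logv(const InfoLogLevel log_level, const char* format,
            va_list ap) override {
    std::fprintf(stderr, "[%d] ", static_cast<int>(log_level));
    std::vfprintf(stderr, format, ap);
    std::fprintf(stderr, "\n");
  }
};
```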
// -#include "env/composite_env_wrapper.h" #include "rocksdb/file_system.h" + +#include "env/composite_env_wrapper.h" #include "options/db_options.h" +#include "rocksdb/convenience.h" #include "rocksdb/utilities/object_registry.h" namespace ROCKSDB_NAMESPACE { @@ -16,10 +18,18 @@ FileSystem::~FileSystem() {} Status FileSystem::Load(const std::string& value, std::shared_ptr* result) { + return CreateFromString(ConfigOptions(), value, result); +} + +Status FileSystem::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { Status s; #ifndef ROCKSDB_LITE + (void)config_options; s = ObjectRegistry::NewInstance()->NewSharedObject(value, result); #else + (void)config_options; (void)result; s = Status::NotSupported("Cannot load FileSystem in LITE mode", value); #endif @@ -83,6 +93,14 @@ FileOptions FileSystem::OptimizeForCompactionTableRead( return optimized_file_options; } +FileOptions FileSystem::OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, const std::string& fname, bool should_sync) { std::unique_ptr file; @@ -129,13 +147,4 @@ IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, return s; } -#ifdef OS_WIN -std::shared_ptr FileSystem::Default() { - static LegacyFileSystemWrapper default_fs(Env::Default()); - static std::shared_ptr default_fs_ptr( - &default_fs, [](LegacyFileSystemWrapper*) {}); - return default_fs_ptr; -} -#endif - } // namespace ROCKSDB_NAMESPACE diff --git a/env/file_system_tracer.cc b/env/file_system_tracer.cc index ad15df822e7..9a85dd5e080 100644 --- a/env/file_system_tracer.cc +++ b/env/file_system_tracer.cc @@ -5,33 +5,108 @@ #include "env/file_system_tracer.h" -#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { +IOStatus FileSystemTracingWrapper::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewSequentialFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + IOStatus FileSystemTracingWrapper::NewWritableFile( const std::string& fname, const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); uint64_t elapsed = 
timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), fname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& file_opts, std::unique_ptr* result, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->ReuseWritableFile(fname, old_fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomRWFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomRWFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FileSystemTracingWrapper::NewDirectory( const std::string& name, const IOOptions& io_opts, std::unique_ptr* result, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->NewDirectory(name, io_opts, result, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), name); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + name.substr(name.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -39,64 +114,69 @@ IOStatus FileSystemTracingWrapper::GetChildren(const std::string& dir, const IOOptions& io_opts, std::vector* r, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->GetChildren(dir, io_opts, r, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), dir); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dir.substr(dir.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus 
FileSystemTracingWrapper::DeleteFile(const std::string& fname, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->DeleteFile(fname, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), fname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FileSystemTracingWrapper::CreateDir(const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->CreateDir(dirname, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), dirname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FileSystemTracingWrapper::CreateDirIfMissing( const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->CreateDirIfMissing(dirname, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), dirname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FileSystemTracingWrapper::DeleteDir(const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->DeleteDir(dirname, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), dirname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -104,13 +184,33 @@ IOStatus FileSystemTracingWrapper::GetFileSize(const std::string& fname, const IOOptions& options, uint64_t* file_size, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->GetFileSize(fname, options, file_size, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileNameAndFileSize, - __func__, elapsed, s.ToString(), fname, *file_size); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, elapsed, + s.ToString(), fname.substr(fname.find_last_of("/\\") + 1), *file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; 
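To make the new io_op_data encoding concrete, here is a hypothetical consumer-side snippet (not part of the patch) that tests the same bits these wrappers set. The IOTraceRecord field names are assumptions based on trace_replay/io_tracer.h and may differ slightly.

```cpp
// Hypothetical trace-record dump; field names are assumed, not confirmed.
#include <cinttypes>
#include <cstdio>

#include "trace_replay/io_tracer.h"

using namespace ROCKSDB_NAMESPACE;

void DumpRecord(const IOTraceRecord& record) {
  if (record.io_op_data & (1 << IOTraceOp::kIOFileSize)) {
    std::printf("file_size=%" PRIu64 "\n", record.file_size);
  }
  if (record.io_op_data & (1 << IOTraceOp::kIOLen)) {
    std::printf("len=%zu\n", record.len);
  }
  if (record.io_op_data & (1 << IOTraceOp::kIOOffset)) {
    std::printf("offset=%" PRIu64 "\n", record.offset);
  }
}
```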
+} + +IOStatus FileSystemTracingWrapper::Truncate(const std::string& fname, + size_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(fname, size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1), size); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -118,40 +218,50 @@ IOStatus FSSequentialFileTracingWrapper::Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Read(n, options, result, scratch, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLen, __func__, - elapsed, s.ToString(), result->size()); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSSequentialFileTracingWrapper::InvalidateCache(size_t offset, size_t length) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->InvalidateCache(offset, length); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), length, offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + offset); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); return s; } IOStatus FSSequentialFileTracingWrapper::PositionedRead( uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->PositionedRead(offset, n, options, result, scratch, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), result->size(), - offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -159,13 +269,17 @@ IOStatus FSRandomAccessFileTracingWrapper::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), n, offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + 
io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -173,16 +287,19 @@ IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->MultiRead(reqs, num_reqs, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); uint64_t latency = elapsed; + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); for (size_t i = 0; i < num_reqs; i++) { - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, latency, reqs[i].status.ToString(), - reqs[i].len, reqs[i].offset); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, latency, + reqs[i].status.ToString(), file_name_, reqs[i].len, reqs[i].offset); + io_tracer_->WriteIOOp(io_record, dbg); } return s; } @@ -190,116 +307,142 @@ IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs, IOStatus FSRandomAccessFileTracingWrapper::Prefetch(uint64_t offset, size_t n, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Prefetch(offset, n, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), n, offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset, size_t length) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->InvalidateCache(offset, length); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), length, + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, static_cast(offset)); - io_tracer_->WriteIOOp(io_record); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); return s; } IOStatus FSWritableFileTracingWrapper::Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Append(data, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLen, __func__, - elapsed, s.ToString(), data.size()); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), 0 /*Offset*/); + 
io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSWritableFileTracingWrapper::PositionedAppend( const Slice& data, uint64_t offset, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->PositionedAppend(data, offset, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), data.size(), offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSWritableFileTracingWrapper::Truncate(uint64_t size, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Truncate(size, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLen, __func__, - elapsed, s.ToString(), size); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, size, + 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSWritableFileTracingWrapper::Close(const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Close(options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOGeneral, __func__, - elapsed, s.ToString()); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); return s; } uint64_t FSWritableFileTracingWrapper::GetFileSize(const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); uint64_t file_size = target()->GetFileSize(options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileNameAndFileSize, - "GetFileSize", elapsed, "OK", "" /* file_name */, - file_size); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, "OK", file_name_, file_size); + io_tracer_->WriteIOOp(io_record, dbg); return file_size; } IOStatus FSWritableFileTracingWrapper::InvalidateCache(size_t offset, size_t length) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->InvalidateCache(offset, length); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), length, + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, static_cast(offset)); - 
io_tracer_->WriteIOOp(io_record); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); return s; } IOStatus FSRandomRWFileTracingWrapper::Write(uint64_t offset, const Slice& data, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Write(offset, data, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), data.size(), offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -307,13 +450,69 @@ IOStatus FSRandomRWFileTracingWrapper::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), n, offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Flush(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Flush(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Sync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Sync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Fsync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Fsync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); return s; } } // namespace ROCKSDB_NAMESPACE diff --git a/env/file_system_tracer.h b/env/file_system_tracer.h index 
4f37cbb0f2b..da87797d30b 100644 --- a/env/file_system_tracer.h +++ b/env/file_system_tracer.h @@ -6,6 +6,7 @@ #pragma once #include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "trace_replay/io_tracer.h" namespace ROCKSDB_NAMESPACE { @@ -18,17 +19,44 @@ namespace ROCKSDB_NAMESPACE { // overridden. class FileSystemTracingWrapper : public FileSystemWrapper { public: - FileSystemTracingWrapper(std::shared_ptr t, - std::shared_ptr io_tracer) - : FileSystemWrapper(t), io_tracer_(io_tracer), env_(Env::Default()) {} + FileSystemTracingWrapper(const std::shared_ptr& t, + const std::shared_ptr& io_tracer) + : FileSystemWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()) {} ~FileSystemTracingWrapper() override {} + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewWritableFile(const std::string& fname, const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, std::unique_ptr* result, IODebugContext* dbg) override; @@ -53,9 +81,12 @@ class FileSystemTracingWrapper : public FileSystemWrapper { IOStatus GetFileSize(const std::string& fname, const IOOptions& options, uint64_t* file_size, IODebugContext* dbg) override; + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; }; // The FileSystemPtr is a wrapper class that takes pointer to storage systems @@ -103,10 +134,12 @@ class FileSystemPtr { class FSSequentialFileTracingWrapper : public FSSequentialFileWrapper { public: FSSequentialFileTracingWrapper(FSSequentialFile* t, - std::shared_ptr io_tracer) + std::shared_ptr io_tracer, + const std::string& file_name) : FSSequentialFileWrapper(t), io_tracer_(io_tracer), - env_(Env::Default()) {} + clock_(SystemClock::Default().get()), + file_name_(file_name) {} ~FSSequentialFileTracingWrapper() override {} @@ -121,7 +154,8 @@ class FSSequentialFileTracingWrapper : public FSSequentialFileWrapper { private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; + std::string file_name_; }; // The FSSequentialFilePtr is a wrapper class that takes pointer to storage @@ -133,10 +167,13 @@ class FSSequentialFilePtr { public: FSSequentialFilePtr() = delete; FSSequentialFilePtr(std::unique_ptr&& fs, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& file_name) : fs_(std::move(fs)), io_tracer_(io_tracer), - fs_tracer_(fs_.get(), io_tracer_) {} + fs_tracer_(fs_.get(), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} FSSequentialFile* operator->() const { if (io_tracer_ && io_tracer_->is_tracing_enabled()) { @@ -169,10 +206,12 
@@ class FSSequentialFilePtr { class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileWrapper { public: FSRandomAccessFileTracingWrapper(FSRandomAccessFile* t, - std::shared_ptr io_tracer) + std::shared_ptr io_tracer, + const std::string& file_name) : FSRandomAccessFileWrapper(t), io_tracer_(io_tracer), - env_(Env::Default()) {} + clock_(SystemClock::Default().get()), + file_name_(file_name) {} ~FSRandomAccessFileTracingWrapper() override {} @@ -190,7 +229,9 @@ class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileWrapper { private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; }; // The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage @@ -201,10 +242,13 @@ class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileWrapper { class FSRandomAccessFilePtr { public: FSRandomAccessFilePtr(std::unique_ptr&& fs, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& file_name) : fs_(std::move(fs)), io_tracer_(io_tracer), - fs_tracer_(fs_.get(), io_tracer_) {} + fs_tracer_(fs_.get(), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} FSRandomAccessFile* operator->() const { if (io_tracer_ && io_tracer_->is_tracing_enabled()) { @@ -237,8 +281,12 @@ class FSRandomAccessFilePtr { class FSWritableFileTracingWrapper : public FSWritableFileWrapper { public: FSWritableFileTracingWrapper(FSWritableFile* t, - std::shared_ptr io_tracer) - : FSWritableFileWrapper(t), io_tracer_(io_tracer), env_(Env::Default()) {} + std::shared_ptr io_tracer, + const std::string& file_name) + : FSWritableFileWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} ~FSWritableFileTracingWrapper() override {} @@ -271,7 +319,9 @@ class FSWritableFileTracingWrapper : public FSWritableFileWrapper { private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; + // Stores file name instead of full path. 
+ std::string file_name_; }; // The FSWritableFilePtr is a wrapper class that takes pointer to storage @@ -282,9 +332,13 @@ class FSWritableFileTracingWrapper : public FSWritableFileWrapper { class FSWritableFilePtr { public: FSWritableFilePtr(std::unique_ptr&& fs, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& file_name) : fs_(std::move(fs)), io_tracer_(io_tracer) { - fs_tracer_.reset(new FSWritableFileTracingWrapper(fs_.get(), io_tracer_)); + fs_tracer_.reset(new FSWritableFileTracingWrapper( + fs_.get(), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */)); } FSWritableFile* operator->() const { @@ -324,8 +378,12 @@ class FSWritableFilePtr { class FSRandomRWFileTracingWrapper : public FSRandomRWFileWrapper { public: FSRandomRWFileTracingWrapper(FSRandomRWFile* t, - std::shared_ptr io_tracer) - : FSRandomRWFileWrapper(t), io_tracer_(io_tracer), env_(Env::Default()) {} + std::shared_ptr io_tracer, + const std::string& file_name) + : FSRandomRWFileWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} ~FSRandomRWFileTracingWrapper() override {} @@ -336,9 +394,19 @@ class FSRandomRWFileTracingWrapper : public FSRandomRWFileWrapper { Slice* result, char* scratch, IODebugContext* dbg) const override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; }; // The FSRandomRWFilePtr is a wrapper class that takes pointer to storage @@ -349,10 +417,13 @@ class FSRandomRWFileTracingWrapper : public FSRandomRWFileWrapper { class FSRandomRWFilePtr { public: FSRandomRWFilePtr(std::unique_ptr&& fs, - std::shared_ptr io_tracer) + std::shared_ptr io_tracer, + const std::string& file_name) : fs_(std::move(fs)), io_tracer_(io_tracer), - fs_tracer_(fs_.get(), io_tracer_) {} + fs_tracer_(fs_.get(), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} FSRandomRWFile* operator->() const { if (io_tracer_ && io_tracer_->is_tracing_enabled()) { diff --git a/env/fs_posix.cc b/env/fs_posix.cc index 5fa3d8224f0..a3e360806b6 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -15,10 +15,6 @@ #endif #include #include - -#if defined(OS_LINUX) -#include -#endif #include #include #include @@ -29,13 +25,13 @@ #include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include -#include #include #endif #include #include #include #include + #include // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) @@ -52,7 +48,6 @@ #include "env/composite_env_wrapper.h" #include "env/io_posix.h" -#include "logging/logging.h" #include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" @@ -86,9 +81,7 @@ inline mode_t GetDBFileMode(bool allow_non_owner_access) { return allow_non_owner_access ? 0644 : 0600; } -static uint64_t gettid() { - return Env::Default()->GetThreadID(); -} +static uint64_t gettid() { return Env::Default()->GetThreadID(); } // list of pathnames that are locked // Only used for error message. 
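The tracing wrappers above repeatedly strip the directory portion of a path with `substr(find_last_of("/\\") + 1)` before storing it in trace records. A tiny standalone illustration of that expression (not part of the patch):

```cpp
// Equivalent of the inline basename expression used by the tracing wrappers.
#include <cassert>
#include <string>

std::string BaseName(const std::string& path) {
  // find_last_of returns npos when no separator exists; npos + 1 wraps to 0,
  // so the whole string is returned unchanged in that case.
  return path.substr(path.find_last_of("/\\") + 1);
}

int main() {
  assert(BaseName("/db/data/000123.sst") == "000123.sst");
  assert(BaseName("C:\\db\\LOG") == "LOG");
  assert(BaseName("MANIFEST-000001") == "MANIFEST-000001");
  return 0;
}
```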
@@ -272,8 +265,7 @@ class PosixFileSystem : public FileSystem { } virtual IOStatus OpenWritableFile(const std::string& fname, - const FileOptions& options, - bool reopen, + const FileOptions& options, bool reopen, std::unique_ptr* result, IODebugContext* /*dbg*/) { result->reset(); @@ -556,26 +548,37 @@ class PosixFileSystem : public FileSystem { } IOStatus NewLogger(const std::string& fname, const IOOptions& /*opts*/, - std::shared_ptr* result, - IODebugContext* /*dbg*/) override { - FILE* f; + std::shared_ptr* result, + IODebugContext* /*dbg*/) override { + FILE* f = nullptr; + int fd; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), - "w" + fd = open(fname.c_str(), + cloexec_flags(O_WRONLY | O_CREAT | O_TRUNC, nullptr), + GetDBFileMode(allow_non_owner_access_)); + if (fd != -1) { + f = fdopen(fd, + "w" #ifdef __GLIBC_PREREQ #if __GLIBC_PREREQ(2, 7) - "e" // glibc extension to enable O_CLOEXEC + "e" // glibc extension to enable O_CLOEXEC #endif #endif - ); + ); + } + } + if (fd == -1) { + result->reset(); + return status_to_io_status( + IOError("when open a file for new logger", fname, errno)); } if (f == nullptr) { + close(fd); result->reset(); return status_to_io_status( - IOError("when fopen a file for new logger", fname, errno)); + IOError("when fdopen a file for new logger", fname, errno)); } else { - int fd = fileno(f); #ifdef ROCKSDB_FALLOCATE_PRESENT fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024); #endif @@ -612,6 +615,7 @@ class PosixFileSystem : public FileSystem { std::vector* result, IODebugContext* /*dbg*/) override { result->clear(); + DIR* d = opendir(dir.c_str()); if (d == nullptr) { switch (errno) { @@ -623,11 +627,36 @@ class PosixFileSystem : public FileSystem { return IOError("While opendir", dir, errno); } } + + // reset errno before calling readdir() + errno = 0; struct dirent* entry; while ((entry = readdir(d)) != nullptr) { - result->push_back(entry->d_name); + // filter out '.' and '..' directory entries + // which appear only on some platforms + const bool ignore = + entry->d_type == DT_DIR && + (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0); + if (!ignore) { + result->push_back(entry->d_name); + } + errno = 0; // reset errno if readdir() success + } + + // always attempt to close the dir + const auto pre_close_errno = errno; // errno may be modified by closedir + const int close_result = closedir(d); + + if (pre_close_errno != 0) { + // error occurred during readdir + return IOError("While readdir", dir, pre_close_errno); } - closedir(d); + + if (close_result != 0) { + // error occurred during closedir + return IOError("While closedir", dir, errno); + } + return IOStatus::OK(); } @@ -755,7 +784,9 @@ class PosixFileSystem : public FileSystem { LockHoldingInfo lhi; int64_t current_time = 0; // Ignore status code as the time is only used for error message. 
- Env::Default()->GetCurrentTime(¤t_time).PermitUncheckedError(); + SystemClock::Default() + ->GetCurrentTime(¤t_time) + .PermitUncheckedError(); lhi.acquire_time = current_time; lhi.acquiring_thread = Env::Default()->GetThreadID(); @@ -771,9 +802,9 @@ class PosixFileSystem : public FileSystem { // closed, all locks the process holds for that *file* are released const auto it_success = locked_files.insert({fname, lhi}); if (it_success.second == false) { + LockHoldingInfo prev_info = it_success.first->second; mutex_locked_files.Unlock(); errno = ENOLCK; - LockHoldingInfo& prev_info = it_success.first->second; // Note that the thread ID printed is the same one as the one in // posix logger, but posix logger prints it hex format. return IOError("lock hold by current process, acquire time " + @@ -841,7 +872,7 @@ class PosixFileSystem : public FileSystem { char the_path[256]; char* ret = getcwd(the_path, 256); if (ret == nullptr) { - return IOStatus::IOError(strerror(errno)); + return IOStatus::IOError(errnoStr(errno).c_str()); } *output_path = ret; @@ -875,7 +906,17 @@ class PosixFileSystem : public FileSystem { return IOError("While doing statvfs", fname, errno); } - *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + // sbuf.bfree is total free space available to root + // sbuf.bavail is total free space available to unprivileged user + // sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id + if (geteuid()) { + // non-zero user is unprivileged, or -1 if error. take more conservative + // size + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail); + } else { + // root user can access all disk space + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + } return IOStatus::OK(); } @@ -904,7 +945,7 @@ class PosixFileSystem : public FileSystem { } FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { + const DBOptions& db_options) const override { FileOptions optimized = file_options; optimized.use_mmap_writes = false; optimized.use_direct_writes = false; diff --git a/env/fs_readonly.h b/env/fs_readonly.h new file mode 100644 index 00000000000..89875106eec --- /dev/null +++ b/env/fs_readonly.h @@ -0,0 +1,104 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// A FileSystem wrapper that only allows read-only operation. +// +// This class has not been fully analyzed for providing strong security +// guarantees. 
+class ReadOnlyFileSystem : public FileSystemWrapper { + static inline IOStatus FailReadOnly() { + IOStatus s = IOStatus::IOError("Attempted write to ReadOnlyFileSystem"); + assert(s.GetRetryable() == false); + return s; + } + + public: + explicit ReadOnlyFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + IOStatus NewWritableFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus ReuseWritableFile(const std::string& /*fname*/, + const std::string& /*old_fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewRandomRWFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewDirectory(const std::string& /*dir*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus DeleteFile(const std::string& /*fname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override { + // Allow if dir already exists + bool is_dir = false; + IOStatus s = IsDirectory(dirname, options, &is_dir, dbg); + if (s.ok() && is_dir) { + return s; + } else { + return FailReadOnly(); + } + } + IOStatus DeleteDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus RenameFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LockFile(const std::string& /*fname*/, const IOOptions& /*options*/, + FileLock** /*lock*/, IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*options*/, + std::shared_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/env/fs_remap.cc b/env/fs_remap.cc new file mode 100644 index 00000000000..026f83cd12a --- /dev/null +++ b/env/fs_remap.cc @@ -0,0 +1,306 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
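As an illustration of how the new ReadOnlyFileSystem is meant to be composed, here is a hypothetical sketch (not part of the patch). It uses the internal headers env/fs_readonly.h and env/composite_env_wrapper.h and assumes NewCompositeEnv is declared there, as used by the tests earlier in this diff.

```cpp
// Hypothetical composition: wrap the default FileSystem in the read-only
// wrapper and open a DB through an Env built from it.
#include <memory>
#include <string>

#include "env/composite_env_wrapper.h"  // NewCompositeEnv (internal)
#include "env/fs_readonly.h"            // ReadOnlyFileSystem (internal)
#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

Status OpenForReadOnly(const std::string& dbname, DB** db,
                       std::unique_ptr<Env>* env_out) {
  auto ro_fs = std::make_shared<ReadOnlyFileSystem>(FileSystem::Default());
  std::unique_ptr<Env> env = NewCompositeEnv(ro_fs);

  Options options;
  options.env = env.get();
  // Any attempted write reaches ReadOnlyFileSystem and fails with
  // "Attempted write to ReadOnlyFileSystem".
  Status s = DB::OpenForReadOnly(options, dbname, db);
  *env_out = std::move(env);  // keep the Env alive as long as the DB is open
  return s;
}
```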
+ +#ifndef ROCKSDB_LITE + +#include "env/fs_remap.h" + +namespace ROCKSDB_NAMESPACE { + +RemapFileSystem::RemapFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + +std::pair RemapFileSystem::EncodePathWithNewBasename( + const std::string& path) { + // No difference by default + return EncodePath(path); +} + +Status RemapFileSystem::RegisterDbPaths(const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::RegisterDbPaths(encoded_paths); +} + +Status RemapFileSystem::UnregisterDbPaths( + const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::UnregisterDbPaths(encoded_paths); +} + +IOStatus RemapFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewSequentialFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomAccessFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewWritableFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + auto status_and_old_enc_path = EncodePath(old_fname); + if (!status_and_old_enc_path.first.ok()) { + return status_and_old_enc_path.first; + } + return FileSystemWrapper::ReuseWritableFile(status_and_old_enc_path.second, + status_and_old_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomRWFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::NewDirectory(const std::string& dir, + const IOOptions& options, + std::unique_ptr* result, + 
IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewDirectory(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::FileExists(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::GetChildren(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildren(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildrenFileAttributes( + status_and_enc_path.second, options, result, dbg); +} + +IOStatus RemapFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteFile(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDirIfMissing(status_and_enc_path.second, + options, dbg); +} + +IOStatus RemapFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileSize(status_and_enc_path.second, options, + file_size, dbg); +} + +IOStatus RemapFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileModificationTime(status_and_enc_path.second, + options, file_mtime, dbg); +} + +IOStatus 
RemapFileSystem::IsDirectory(const std::string& path, + const IOOptions& options, bool* is_dir, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::IsDirectory(status_and_enc_path.second, options, + is_dir, dbg); +} + +IOStatus RemapFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::RenameFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::LinkFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LockFile(const std::string& fname, + const IOOptions& options, FileLock** lock, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + // FileLock subclasses may store path (e.g., PosixFileLock stores it). We + // can skip stripping the chroot directory from this path because callers + // shouldn't use it. + return FileSystemWrapper::LockFile(status_and_enc_path.second, options, lock, + dbg); +} + +IOStatus RemapFileSystem::NewLogger(const std::string& fname, + const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewLogger(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(db_path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetAbsolutePath(status_and_enc_path.second, options, + output_path, dbg); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/env/fs_remap.h b/env/fs_remap.h new file mode 100644 index 00000000000..4975822f66d --- /dev/null +++ b/env/fs_remap.h @@ -0,0 +1,131 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// An abstract FileSystem wrapper that creates a view of an existing +// FileSystem by remapping names in some way. 
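[Editor's note: the remapping hooks implemented above, and declared in the header that continues below, only require a subclass to provide EncodePath(); every other override routes through that mapping. A hedged sketch of a hypothetical subclass that pins all paths under a fixed prefix, roughly a chroot-style view; the class name and semantics are invented for illustration.]

#include <memory>
#include <string>
#include <utility>

#include "env/fs_remap.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical example: expose "<prefix>/<path>" of the wrapped filesystem
// as plain "<path>".
class PrefixRemapFileSystem : public RemapFileSystem {
 public:
  PrefixRemapFileSystem(const std::shared_ptr<FileSystem>& base,
                        std::string prefix)
      : RemapFileSystem(base), prefix_(std::move(prefix)) {}

  const char* Name() const override { return "PrefixRemapFileSystem"; }

 protected:
  std::pair<IOStatus, std::string> EncodePath(
      const std::string& path) override {
    if (path.empty() || path[0] != '/') {
      return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""};
    }
    // Reads and writes on "path" land on "prefix_ + path" in the wrapped
    // filesystem; RemapFileSystem's overrides reuse this mapping everywhere.
    return {IOStatus::OK(), prefix_ + path};
  }

 private:
  std::string prefix_;
};

}  // namespace ROCKSDB_NAMESPACE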
+// +// This class has not been fully analyzed for providing strong security +// guarantees. +class RemapFileSystem : public FileSystemWrapper { + public: + explicit RemapFileSystem(const std::shared_ptr& base); + + protected: + // Returns status and mapped-to path in the wrapped filesystem. + // If it returns non-OK status, the returned path should not be used. + virtual std::pair EncodePath( + const std::string& path) = 0; + + // Similar to EncodePath() except used in cases in which it is OK for + // no file or directory on 'path' to already exist, such as if the + // operation would create one. However, the parent of 'path' is expected + // to exist for the operation to succeed. + // Default implementation: call EncodePath + virtual std::pair EncodePathWithNewBasename( + const std::string& path); + + public: + // Left abstract: + // const char* Name() const override { ... } + + Status RegisterDbPaths(const std::vector& paths) override; + + Status UnregisterDbPaths(const std::vector& paths) override; + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& dir, const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus FileExists(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override; + + IOStatus RenameFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LinkFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + + IOStatus 
NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) override; + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/env/io_posix.cc b/env/io_posix.cc index 689d898120b..a041b32aa6a 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -27,7 +27,6 @@ #include #ifdef OS_LINUX #include -#include #include #endif #include "monitoring/iostats_context_imp.h" @@ -59,7 +58,7 @@ IOStatus IOError(const std::string& context, const std::string& file_name, switch (err_number) { case ENOSPC: { IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name), - strerror(err_number)); + errnoStr(err_number).c_str()); s.SetRetryable(true); return s; } @@ -67,10 +66,10 @@ IOStatus IOError(const std::string& context, const std::string& file_name, return IOStatus::IOError(IOStatus::kStaleFile); case ENOENT: return IOStatus::PathNotFound(IOErrorMsg(context, file_name), - strerror(err_number)); + errnoStr(err_number).c_str()); default: return IOStatus::IOError(IOErrorMsg(context, file_name), - strerror(err_number)); + errnoStr(err_number).c_str()); } } @@ -634,6 +633,8 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); } + IOStatus ios = IOStatus::OK(); + struct WrappedReadRequest { FSReadRequest* req; struct iovec iov; @@ -680,19 +681,47 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, ssize_t ret = io_uring_submit_and_wait(iu, static_cast(this_reqs)); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + &ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + iu); + if (static_cast(ret) != this_reqs) { fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs); + // If error happens and we submitted fewer than expected, it is an + // exception case and we don't retry here. We should still consume + // what is is submitted in the ring. 
+ for (ssize_t i = 0; i < ret; i++) { + struct io_uring_cqe* cqe = nullptr; + io_uring_wait_cqe(iu, &cqe); + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + } + return IOStatus::IOError("io_uring_submit_and_wait() requested " + + ToString(this_reqs) + " but returned " + + ToString(ret)); } - assert(static_cast(ret) == this_reqs); for (size_t i = 0; i < this_reqs; i++) { - struct io_uring_cqe* cqe; + struct io_uring_cqe* cqe = nullptr; WrappedReadRequest* req_wrap; // We could use the peek variant here, but this seems safer in terms // of our initial wait not reaping all completions ret = io_uring_wait_cqe(iu, &cqe); - assert(!ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret); + if (ret) { + ios = IOStatus::IOError("io_uring_wait_cqe() returns " + ToString(ret)); + + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + continue; + } req_wrap = static_cast(io_uring_cqe_get_data(cqe)); FSReadRequest* req = req_wrap->req; @@ -741,7 +770,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, io_uring_cqe_seen(iu, cqe); } } - return IOStatus::OK(); + return ios; #else return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); #endif @@ -894,7 +923,7 @@ IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { * knows enough to skip zero suffixes. */ IOStatus PosixMmapFile::UnmapCurrentRegion() { - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); if (base_ != nullptr) { int munmap_status = munmap(base_, limit_ - base_); if (munmap_status != 0) { @@ -917,7 +946,7 @@ IOStatus PosixMmapFile::UnmapCurrentRegion() { IOStatus PosixMmapFile::MapNewRegion() { #ifdef ROCKSDB_FALLOCATE_PRESENT assert(base_ == nullptr); - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); // we can't fallocate with FALLOC_FL_KEEP_SIZE here if (allow_fallocate_) { IOSTATS_TIMER_GUARD(allocate_nanos); @@ -928,17 +957,17 @@ IOStatus PosixMmapFile::MapNewRegion() { } if (alloc_status != 0) { return IOStatus::IOError("Error allocating space to file : " + filename_ + - "Error : " + strerror(alloc_status)); + "Error : " + errnoStr(alloc_status).c_str()); } } - TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:1"); void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, file_offset_); if (ptr == MAP_FAILED) { return IOStatus::IOError("MMap failed on " + filename_); } - TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:2"); base_ = reinterpret_cast(ptr); limit_ = base_ + map_size_; @@ -959,7 +988,7 @@ IOStatus PosixMmapFile::Msync() { size_t p1 = TruncateToPageBoundary(last_sync_ - base_); size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); last_sync_ = dst_; - TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Msync:0"); if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { return IOError("While msync", filename_, errno); } @@ -1012,7 +1041,7 @@ IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/, if (!s.ok()) { return s; } - TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:0"); } size_t n = (left <= avail) ? 
left : avail; @@ -1110,7 +1139,7 @@ IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len, IODebugContext* /*dbg*/) { assert(offset <= static_cast(std::numeric_limits::max())); assert(len <= static_cast(std::numeric_limits::max())); - TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Allocate:0"); int alloc_status = 0; if (allow_fallocate_) { alloc_status = @@ -1214,6 +1243,7 @@ IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/, size_t block_size; size_t last_allocated_block; GetPreallocationStatus(&block_size, &last_allocated_block); + TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block); if (last_allocated_block > 0) { // trim the extra space preallocated at the end of the file // NOTE(ljin): we probably don't want to surface failure as an IOError, @@ -1333,7 +1363,7 @@ IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len, IODebugContext* /*dbg*/) { assert(offset <= static_cast(std::numeric_limits::max())); assert(len <= static_cast(std::numeric_limits::max())); - TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixWritableFile::Allocate:0"); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { diff --git a/env/mock_env.cc b/env/mock_env.cc index 3fdeac2b9ed..3733371fce1 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -8,12 +8,16 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "env/mock_env.h" + #include #include + #include "file/filename.h" #include "port/sys_time.h" +#include "rocksdb/file_system.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" -#include "util/murmurhash.h" +#include "util/hash.h" #include "util/random.h" #include "util/rate_limiter.h" @@ -29,8 +33,7 @@ class MemFile { locked_(false), size_(0), modified_time_(Now()), - rnd_(static_cast( - MurmurHash(fn.data(), static_cast(fn.size()), 0))), + rnd_(Lower32of64(GetSliceNPHash64(fn))), fsynced_bytes_(0) {} // No copying allowed. 
MemFile(const MemFile&) = delete; @@ -78,7 +81,8 @@ class MemFile { uint64_t Size() const { return size_; } - void Truncate(size_t size) { + void Truncate(size_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); if (size < size_) { data_.resize(size); @@ -100,7 +104,17 @@ class MemFile { } } - Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, IODebugContext* /*dbg*/) const { + { + IOStatus s; + TEST_SYNC_POINT_CALLBACK("MemFile::Read:IOStatus", &s); + if (!s.ok()) { + // with sync point only + *result = Slice(); + return s; + } + } MutexLock lock(&mutex_); const uint64_t available = Size() - std::min(Size(), offset); size_t offset_ = static_cast(offset); @@ -109,7 +123,7 @@ class MemFile { } if (n == 0) { *result = Slice(); - return Status::OK(); + return IOStatus::OK(); } if (scratch) { memcpy(scratch, &(data_[offset_]), n); @@ -117,10 +131,11 @@ class MemFile { } else { *result = Slice(&(data_[offset_]), n); } - return Status::OK(); + return IOStatus::OK(); } - Status Write(uint64_t offset, const Slice& data) { + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); size_t offset_ = static_cast(offset); if (offset + data.size() > data_.size()) { @@ -129,20 +144,21 @@ class MemFile { data_.replace(offset_, data.size(), data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); - return Status::OK(); + return IOStatus::OK(); } - Status Append(const Slice& data) { + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); data_.append(data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); - return Status::OK(); + return IOStatus::OK(); } - Status Fsync() { + IOStatus Fsync(const IOOptions& /*options*/, IODebugContext* /*dbg*/) { fsynced_bytes_ = size_.load(); - return Status::OK(); + return IOStatus::OK(); } uint64_t ModifiedTime() const { return modified_time_; } @@ -177,111 +193,176 @@ class MemFile { namespace { -class MockSequentialFile : public SequentialFile { +class MockSequentialFile : public FSSequentialFile { public: - explicit MockSequentialFile(MemFile* file) : file_(file), pos_(0) { + explicit MockSequentialFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads), + pos_(0) { file_->Ref(); } ~MockSequentialFile() override { file_->Unref(); } - Status Read(size_t n, Slice* result, char* scratch) override { - Status s = file_->Read(pos_, n, result, scratch); + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + IOStatus s = file_->Read(pos_, n, options, result, + (use_mmap_read_) ? 
nullptr : scratch, dbg); if (s.ok()) { pos_ += result->size(); } return s; } - Status Skip(uint64_t n) override { + bool use_direct_io() const override { return use_direct_io_; } + IOStatus Skip(uint64_t n) override { if (pos_ > file_->Size()) { - return Status::IOError("pos_ > file_->Size()"); + return IOStatus::IOError("pos_ > file_->Size()"); } const uint64_t available = file_->Size() - pos_; if (n > available) { n = available; } pos_ += static_cast(n); - return Status::OK(); + return IOStatus::OK(); } private: MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; size_t pos_; }; -class MockRandomAccessFile : public RandomAccessFile { +class MockRandomAccessFile : public FSRandomAccessFile { public: - explicit MockRandomAccessFile(MemFile* file) : file_(file) { file_->Ref(); } + explicit MockRandomAccessFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads) { + file_->Ref(); + } ~MockRandomAccessFile() override { file_->Unref(); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - return file_->Read(offset, n, result, scratch); + bool use_direct_io() const override { return use_direct_io_; } + + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + if (use_mmap_read_) { + return file_->Read(offset, n, options, result, nullptr, dbg); + } else { + return file_->Read(offset, n, options, result, scratch, dbg); + } } private: MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; }; -class MockRandomRWFile : public RandomRWFile { +class MockRandomRWFile : public FSRandomRWFile { public: explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); } ~MockRandomRWFile() override { file_->Unref(); } - Status Write(uint64_t offset, const Slice& data) override { - return file_->Write(offset, data); + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return file_->Write(offset, data, options, dbg); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - return file_->Read(offset, n, result, scratch); + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return file_->Read(offset, n, options, result, scratch, dbg); } - Status Close() override { return file_->Fsync(); } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return file_->Fsync(); } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } private: MemFile* file_; }; -class MockWritableFile : public WritableFile { +class MockWritableFile : public FSWritableFile { public: - MockWritableFile(MemFile* file, RateLimiter* rate_limiter) - : file_(file), rate_limiter_(rate_limiter) { + MockWritableFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_writes), + rate_limiter_(opts.rate_limiter) { file_->Ref(); } 
~MockWritableFile() override { file_->Unref(); } - Status Append(const Slice& data) override { + bool use_direct_io() const override { return false && use_direct_io_; } + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { size_t bytes_written = 0; while (bytes_written < data.size()) { auto bytes = RequestToken(data.size() - bytes_written); - Status s = file_->Append(Slice(data.data() + bytes_written, bytes)); + IOStatus s = file_->Append(Slice(data.data() + bytes_written, bytes), + options, dbg); if (!s.ok()) { return s; } bytes_written += bytes; } - return Status::OK(); + return IOStatus::OK(); } - Status Truncate(uint64_t size) override { - file_->Truncate(static_cast(size)); - return Status::OK(); + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t /*offset*/, + const IOOptions& options, + IODebugContext* dbg) override { + assert(use_direct_io_); + return Append(data, options, dbg); } - Status Close() override { return file_->Fsync(); } - Status Flush() override { return Status::OK(); } + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + file_->Truncate(static_cast(size), options, dbg); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } - Status Sync() override { return file_->Fsync(); } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - uint64_t GetFileSize() override { return file_->Size(); } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return file_->Size(); + } private: inline size_t RequestToken(size_t bytes) { @@ -294,12 +375,16 @@ class MockWritableFile : public WritableFile { } MemFile* file_; + bool use_direct_io_; RateLimiter* rate_limiter_; }; -class MockEnvDirectory : public Directory { +class MockEnvDirectory : public FSDirectory { public: - Status Fsync() override { return Status::OK(); } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } }; class MockEnvFileLock : public FileLock { @@ -314,21 +399,26 @@ class MockEnvFileLock : public FileLock { class TestMemLogger : public Logger { private: - std::unique_ptr file_; + std::unique_ptr file_; std::atomic_size_t log_size_; static const uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; Env* env_; + IOOptions options_; + IODebugContext* dbg_; std::atomic flush_pending_; public: - TestMemLogger(std::unique_ptr f, Env* env, + TestMemLogger(std::unique_ptr f, Env* env, + const IOOptions& options, IODebugContext* dbg, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) : Logger(log_level), file_(std::move(f)), log_size_(0), last_flush_micros_(0), env_(env), + options_(options), + dbg_(dbg), flush_pending_(false) {} ~TestMemLogger() override {} @@ -394,7 +484,7 @@ class TestMemLogger : public Logger { assert(p <= limit); const size_t write_size = p - base; - Status s = file_->Append(Slice(base, write_size)); + Status s = file_->Append(Slice(base, write_size), options_, dbg_); if (s.ok()) { flush_pending_ = true; log_size_ += write_size; @@ -414,151 +504,305 @@ class TestMemLogger : public Logger { size_t GetLogFileSize() const override { return 
log_size_; } }; -} // Anonymous namespace +class MockFileSystem : public FileSystem { + public: + explicit MockFileSystem(Env* env, bool supports_direct_io = true) + : env_(env), supports_direct_io_(supports_direct_io) {} -MockEnv::MockEnv(Env* base_env) : EnvWrapper(base_env), fake_sleep_micros_(0) {} + ~MockFileSystem() override { + for (auto i = file_map_.begin(); i != file_map_.end(); ++i) { + i->second->Unref(); + } + } -MockEnv::~MockEnv() { - for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i) { - i->second->Unref(); + const char* Name() const override { return "Memory"; } + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewDirectory(const std::string& /*name*/, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& fname, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override; + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) override; + // Get full directory name for this db. 
+ IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override { + *output_path = NormalizeMockPath(db_path); + if (output_path->at(0) != '/') { + return IOStatus::NotSupported("GetAbsolutePath"); + } else { + return IOStatus::OK(); + } } -} + IOStatus IsDirectory(const std::string& /*path*/, + const IOOptions& /*options*/, bool* /*is_dir*/, + IODebugContext* /*dgb*/) override { + return IOStatus::NotSupported("IsDirectory"); + } + + Status CorruptBuffer(const std::string& fname); + private: + bool RenameFileInternal(const std::string& src, const std::string& dest); + void DeleteFileInternal(const std::string& fname); + bool GetChildrenInternal(const std::string& fname, + std::vector* results); + + std::string NormalizeMockPath(const std::string& path) { + std::string p = NormalizePath(path); + if (p.back() == kFilePathSeparator && p.size() > 1) { + p.pop_back(); + } + return p; + } + + private: + // Map from filenames to MemFile objects, representing a simple file system. + port::Mutex mutex_; + std::map file_map_; // Protected by mutex_. + Env* env_; + bool supports_direct_io_; +}; + +} // Anonymous namespace // Partial implementation of the Env interface. -Status MockEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockSequentialFile(f, file_opts)); + return IOStatus::OK(); } - result->reset(new MockSequentialFile(f)); - return Status::OK(); } -Status MockEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockRandomAccessFile(f, file_opts)); + return IOStatus::OK(); } - result->reset(new MockRandomAccessFile(f)); - return Status::OK(); } -Status MockEnv::NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& /*file_opts*/, + 
std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); } result->reset(new MockRandomRWFile(f)); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) { - auto s = RenameFile(old_fname, fname); +IOStatus MockFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto s = RenameFile(old_fname, fname, IOOptions(), dbg); if (!s.ok()) { return s; + } else { + result->reset(); + return NewWritableFile(fname, options, result, dbg); } - result->reset(); - return NewWritableFile(fname, result, options); } -Status MockEnv::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& env_options) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { DeleteFileInternal(fn); } - MemFile* file = new MemFile(this, fn, false); + MemFile* file = new MemFile(env_, fn, false); file->Ref(); file_map_[fn] = file; + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } +} - result->reset(new MockWritableFile(file, env_options.rate_limiter)); - return Status::OK(); +IOStatus MockFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + MemFile* file = nullptr; + if (file_map_.find(fn) == file_map_.end()) { + file = new MemFile(env_, fn, false); + // Only take a reference when we create the file objectt + file->Ref(); + file_map_[fn] = file; + } else { + file = file_map_[fn]; + } + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } } -Status MockEnv::NewDirectory(const std::string& /*name*/, - std::unique_ptr* result) { +IOStatus MockFileSystem::NewDirectory(const std::string& /*name*/, + const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { result->reset(new MockEnvDirectory()); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::FileExists(const std::string& fname) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::FileExists(const std::string& fname, + const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { // File exists - return Status::OK(); + return IOStatus::OK(); } // Now also check if fn exists as a dir for (const auto& iter : file_map_) { const 
std::string& filename = iter.first; if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' && Slice(filename).starts_with(Slice(fn))) { - return Status::OK(); + return IOStatus::OK(); } } - return Status::NotFound(); + return IOStatus::NotFound(); } -Status MockEnv::GetChildren(const std::string& dir, - std::vector* result) { - auto d = NormalizePath(dir); +bool MockFileSystem::GetChildrenInternal(const std::string& dir, + std::vector* result) { + auto d = NormalizeMockPath(dir); bool found_dir = false; - { - MutexLock lock(&mutex_); - result->clear(); - for (const auto& iter : file_map_) { - const std::string& filename = iter.first; - - if (filename == d) { - found_dir = true; - } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && - Slice(filename).starts_with(Slice(d))) { - found_dir = true; - size_t next_slash = filename.find('/', d.size() + 1); - if (next_slash != std::string::npos) { - result->push_back( - filename.substr(d.size() + 1, next_slash - d.size() - 1)); - } else { - result->push_back(filename.substr(d.size() + 1)); - } + result->clear(); + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + + if (filename == d) { + found_dir = true; + } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && + Slice(filename).starts_with(Slice(d))) { + found_dir = true; + size_t next_slash = filename.find('/', d.size() + 1); + if (next_slash != std::string::npos) { + result->push_back( + filename.substr(d.size() + 1, next_slash - d.size() - 1)); + } else { + result->push_back(filename.substr(d.size() + 1)); } } } result->erase(std::unique(result->begin(), result->end()), result->end()); - return found_dir ? Status::OK() : Status::NotFound(); + return found_dir; +} + +IOStatus MockFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + bool found_dir = GetChildrenInternal(dir, result); + return found_dir ? 
IOStatus::OK() : IOStatus::NotFound(dir); } -void MockEnv::DeleteFileInternal(const std::string& fname) { - assert(fname == NormalizePath(fname)); +void MockFileSystem::DeleteFileInternal(const std::string& fname) { + assert(fname == NormalizeMockPath(fname)); const auto& pair = file_map_.find(fname); if (pair != file_map_.end()) { pair->second->Unref(); @@ -566,180 +810,222 @@ void MockEnv::DeleteFileInternal(const std::string& fname) { } } -Status MockEnv::DeleteFile(const std::string& fname) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } DeleteFileInternal(fn); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::Truncate(const std::string& fname, size_t size) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& options, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } - iter->second->Truncate(size); - return Status::OK(); + iter->second->Truncate(size, options, dbg); + return IOStatus::OK(); } -Status MockEnv::CreateDir(const std::string& dirname) { - auto dn = NormalizePath(dirname); +IOStatus MockFileSystem::CreateDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dn = NormalizeMockPath(dirname); MutexLock lock(&mutex_); if (file_map_.find(dn) == file_map_.end()) { - MemFile* file = new MemFile(this, dn, false); + MemFile* file = new MemFile(env_, dn, false); file->Ref(); file_map_[dn] = file; } else { - return Status::IOError(); + return IOStatus::IOError(); } - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::CreateDirIfMissing(const std::string& dirname) { - CreateDir(dirname).PermitUncheckedError(); - return Status::OK(); +IOStatus MockFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + CreateDir(dirname, options, dbg).PermitUncheckedError(); + return IOStatus::OK(); } -Status MockEnv::DeleteDir(const std::string& dirname) { - return DeleteFile(dirname); +IOStatus MockFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dir = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); + if (file_map_.find(dir) == file_map_.end()) { + return IOStatus::PathNotFound(dir); + } else { + std::vector children; + if (GetChildrenInternal(dir, &children)) { + for (const auto& child : children) { + DeleteFileInternal(child); + } + } + DeleteFileInternal(dir); + return IOStatus::OK(); + } } -Status MockEnv::GetFileSize(const std::string& fname, uint64_t* file_size) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_size, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } *file_size = iter->second->Size(); - return Status::OK(); + return 
IOStatus::OK(); } -Status MockEnv::GetFileModificationTime(const std::string& fname, - uint64_t* time) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* time, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } *time = iter->second->ModifiedTime(); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::RenameFile(const std::string& src, const std::string& dest) { - auto s = NormalizePath(src); - auto t = NormalizePath(dest); - MutexLock lock(&mutex_); - if (file_map_.find(s) == file_map_.end()) { - return Status::IOError(s, "File not found"); +bool MockFileSystem::RenameFileInternal(const std::string& src, + const std::string& dest) { + if (file_map_.find(src) == file_map_.end()) { + return false; + } else { + std::vector children; + if (GetChildrenInternal(src, &children)) { + for (const auto& child : children) { + RenameFileInternal(src + "/" + child, dest + "/" + child); + } + } + DeleteFileInternal(dest); + file_map_[dest] = file_map_[src]; + file_map_.erase(src); + return true; } +} - DeleteFileInternal(t); - file_map_[t] = file_map_[s]; - file_map_.erase(s); - return Status::OK(); +IOStatus MockFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); + MutexLock lock(&mutex_); + bool found = RenameFileInternal(s, t); + if (!found) { + return IOStatus::PathNotFound(s); + } else { + return IOStatus::OK(); + } } -Status MockEnv::LinkFile(const std::string& src, const std::string& dest) { - auto s = NormalizePath(src); - auto t = NormalizePath(dest); +IOStatus MockFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); MutexLock lock(&mutex_); if (file_map_.find(s) == file_map_.end()) { - return Status::IOError(s, "File not found"); + return IOStatus::PathNotFound(s); } DeleteFileInternal(t); file_map_[t] = file_map_[s]; file_map_[t]->Ref(); // Otherwise it might get deleted when noone uses s - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewLogger(const std::string& fname, + const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); MemFile* file = nullptr; if (iter == file_map_.end()) { - file = new MemFile(this, fn, false); + file = new MemFile(env_, fn, false); file->Ref(); file_map_[fn] = file; } else { file = iter->second; } - std::unique_ptr f(new MockWritableFile(file, nullptr)); - result->reset(new TestMemLogger(std::move(f), this)); - return Status::OK(); + std::unique_ptr f(new MockWritableFile(file, FileOptions())); + result->reset(new TestMemLogger(std::move(f), env_, io_opts, dbg)); + return IOStatus::OK(); } -Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::LockFile(const std::string& fname, + const IOOptions& /*options*/, + FileLock** flock, 
IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { if (!file_map_[fn]->is_lock_file()) { - return Status::InvalidArgument(fname, "Not a lock file."); + return IOStatus::InvalidArgument(fname, "Not a lock file."); } if (!file_map_[fn]->Lock()) { - return Status::IOError(fn, "Lock is already held."); + return IOStatus::IOError(fn, "lock is already held."); } } else { - auto* file = new MemFile(this, fn, true); + auto* file = new MemFile(env_, fn, true); file->Ref(); file->Lock(); file_map_[fn] = file; } } *flock = new MockEnvFileLock(fn); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::UnlockFile(FileLock* flock) { +IOStatus MockFileSystem::UnlockFile(FileLock* flock, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { std::string fn = static_cast_with_check(flock)->FileName(); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { if (!file_map_[fn]->is_lock_file()) { - return Status::InvalidArgument(fn, "Not a lock file."); + return IOStatus::InvalidArgument(fn, "Not a lock file."); } file_map_[fn]->Unlock(); } } delete flock; - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::GetTestDirectory(std::string* path) { +IOStatus MockFileSystem::GetTestDirectory(const IOOptions& /*options*/, + std::string* path, + IODebugContext* /*dbg*/) { *path = "/test"; - return Status::OK(); -} - -Status MockEnv::GetCurrentTime(int64_t* unix_time) { - auto s = EnvWrapper::GetCurrentTime(unix_time); - if (s.ok()) { - *unix_time += fake_sleep_micros_.load() / (1000 * 1000); - } - return s; + return IOStatus::OK(); } -uint64_t MockEnv::NowMicros() { - return EnvWrapper::NowMicros() + fake_sleep_micros_.load(); -} - -uint64_t MockEnv::NowNanos() { - return EnvWrapper::NowNanos() + fake_sleep_micros_.load() * 1000; -} - -Status MockEnv::CorruptBuffer(const std::string& fname) { - auto fn = NormalizePath(fname); +Status MockFileSystem::CorruptBuffer(const std::string& fname) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { @@ -748,9 +1034,52 @@ Status MockEnv::CorruptBuffer(const std::string& fname) { iter->second->CorruptBuffer(); return Status::OK(); } +namespace { +class MockSystemClock : public SystemClockWrapper { + public: + explicit MockSystemClock(const std::shared_ptr& c) + : SystemClockWrapper(c), fake_sleep_micros_(0) {} + + void FakeSleepForMicroseconds(int64_t micros) { + fake_sleep_micros_.fetch_add(micros); + } + + const char* Name() const override { return "MockSystemClock"; } + + Status GetCurrentTime(int64_t* unix_time) override { + auto s = SystemClockWrapper::GetCurrentTime(unix_time); + if (s.ok()) { + auto fake_time = fake_sleep_micros_.load() / (1000 * 1000); + *unix_time += fake_time; + } + return s; + } + + uint64_t NowMicros() override { + return SystemClockWrapper::NowMicros() + fake_sleep_micros_.load(); + } + + uint64_t NowNanos() override { + return SystemClockWrapper::NowNanos() + fake_sleep_micros_.load() * 1000; + } + + private: + std::atomic fake_sleep_micros_; +}; +} // namespace +MockEnv::MockEnv(Env* base_env) + : CompositeEnvWrapper( + base_env, std::make_shared(this), + std::make_shared(base_env->GetSystemClock())) {} + +Status MockEnv::CorruptBuffer(const std::string& fname) { + auto mock = static_cast_with_check(GetFileSystem().get()); + return mock->CorruptBuffer(fname); +} void MockEnv::FakeSleepForMicroseconds(int64_t micros) { - 
fake_sleep_micros_.fetch_add(micros); + auto mock = static_cast_with_check(GetSystemClock().get()); + mock->FakeSleepForMicroseconds(micros); } #ifndef ROCKSDB_LITE diff --git a/env/mock_env.h b/env/mock_env.h index 1ed5c0b1f73..5e7faf55b85 100644 --- a/env/mock_env.h +++ b/env/mock_env.h @@ -12,93 +12,17 @@ #include #include #include + +#include "env/composite_env_wrapper.h" #include "rocksdb/env.h" #include "rocksdb/status.h" -#include "port/port.h" -#include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -class MemFile; -class MockEnv : public EnvWrapper { +class MockEnv : public CompositeEnvWrapper { public: explicit MockEnv(Env* base_env); - ~MockEnv() override; - - // Partial implementation of the Env interface. - Status RegisterDbPaths(const std::vector& /*paths*/) override { - return Status::OK(); - } - - Status UnregisterDbPaths(const std::vector& /*paths*/) override { - return Status::OK(); - } - - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& env_options) override; - - Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - Status FileExists(const std::string& fname) override; - - Status GetChildren(const std::string& dir, - std::vector* result) override; - - void DeleteFileInternal(const std::string& fname); - - Status DeleteFile(const std::string& fname) override; - - Status Truncate(const std::string& fname, size_t size) override; - - Status CreateDir(const std::string& dirname) override; - - Status CreateDirIfMissing(const std::string& dirname) override; - - Status DeleteDir(const std::string& dirname) override; - - Status GetFileSize(const std::string& fname, uint64_t* file_size) override; - - Status GetFileModificationTime(const std::string& fname, - uint64_t* time) override; - - Status RenameFile(const std::string& src, const std::string& target) override; - - Status LinkFile(const std::string& src, const std::string& target) override; - - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; - - Status LockFile(const std::string& fname, FileLock** flock) override; - - Status UnlockFile(FileLock* flock) override; - - Status GetTestDirectory(std::string* path) override; - - // Results of these can be affected by FakeSleepForMicroseconds() - Status GetCurrentTime(int64_t* unix_time) override; - uint64_t NowMicros() override; - uint64_t NowNanos() override; - Status CorruptBuffer(const std::string& fname); // Doesn't really sleep, just affects output of GetCurrentTime(), NowMicros() @@ -106,12 +30,6 @@ class MockEnv : public EnvWrapper { void FakeSleepForMicroseconds(int64_t micros); private: - // Map from filenames to MemFile objects, representing a simple file system. - typedef std::map FileSystem; - port::Mutex mutex_; - FileSystem file_map_; // Protected by mutex_. 
- - std::atomic fake_sleep_micros_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/examples/Makefile b/examples/Makefile index 27a6f0f421a..faee6f06bfd 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -12,6 +12,8 @@ ifneq ($(USE_RTTI), 1) CXXFLAGS += -fno-rtti endif +CFLAGS += -Wstrict-prototypes + .PHONY: clean librocksdb all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 5a032837e88..5f9994074a6 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -15,17 +15,19 @@ #include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -DeleteScheduler::DeleteScheduler(Env* env, FileSystem* fs, +DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs, int64_t rate_bytes_per_sec, Logger* info_log, SstFileManagerImpl* sst_file_manager, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) - : env_(env), + : clock_(clock), fs_(fs), total_trash_size_(0), rate_bytes_per_sec_(rate_bytes_per_sec), @@ -51,19 +53,21 @@ DeleteScheduler::~DeleteScheduler() { if (bg_thread_) { bg_thread_->join(); } + for (const auto& it : bg_errors_) { + it.second.PermitUncheckedError(); + } } Status DeleteScheduler::DeleteFile(const std::string& file_path, const std::string& dir_to_sync, const bool force_bg) { - Status s; if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && total_trash_size_.load() > sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { // Rate limiting is disabled or trash size makes up more than // max_trash_db_ratio_ (default 25%) of the total DB size TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); - s = fs_->DeleteFile(file_path, IOOptions(), nullptr); + Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr); if (s.ok()) { s = sst_file_manager_->OnDeleteFile(file_path); ROCKS_LOG_INFO(info_log_, @@ -79,7 +83,7 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path, // Move file to trash std::string trash_file; - s = MarkAsTrash(file_path, &trash_file); + Status s = MarkAsTrash(file_path, &trash_file); ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(), s.ToString().c_str()); @@ -99,8 +103,13 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path, // Update the total trash size uint64_t trash_file_size = 0; - fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); - total_trash_size_.fetch_add(trash_file_size); + IOStatus io_s = + fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); + if (io_s.ok()) { + total_trash_size_.fetch_add(trash_file_size); + } + //**TODO: What should we do if we failed to + // get the file size? // Add file to delete queue { @@ -169,17 +178,17 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path, return Status::InvalidArgument("file_path is corrupted"); } - Status s; if (DeleteScheduler::IsTrashFile(file_path)) { // This is already a trash file *trash_file = file_path; - return s; + return Status::OK(); } *trash_file = file_path + kTrashExtension; // TODO(tec) : Implement Env::RenameFileIfNotExist and remove // file_move_mu mutex. 
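// A minimal sketch (illustration only, not part of the patch): the branch
// DeleteScheduler::DeleteFile() takes between deleting a file immediately and
// renaming it to "<file>.trash" for rate-limited background deletion. The
// parameter names mirror the members used above; the helper itself is
// hypothetical and only restates the decision for clarity.
bool ShouldDeleteImmediately(int64_t rate_bytes_per_sec, bool force_bg,
                             uint64_t total_trash_size, uint64_t total_db_size,
                             double max_trash_db_ratio) {
  // Rate limiting disabled -> delete right away.
  if (rate_bytes_per_sec <= 0) {
    return true;
  }
  // Trash already exceeds the allowed fraction of the DB -> delete right
  // away, unless the caller forces background deletion.
  if (!force_bg && total_trash_size > total_db_size * max_trash_db_ratio) {
    return true;
  }
  // Otherwise mark as trash and let the background thread delete it at
  // rate_bytes_per_sec.
  return false;
}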
int cnt = 0; + Status s; InstrumentedMutexLock l(&file_move_mu_); while (true) { s = fs_->FileExists(*trash_file, IOOptions(), nullptr); @@ -197,7 +206,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path, cnt++; } if (s.ok()) { - sst_file_manager_->OnMoveFile(file_path, *trash_file); + s = sst_file_manager_->OnMoveFile(file_path, *trash_file); } return s; } @@ -216,14 +225,14 @@ void DeleteScheduler::BackgroundEmptyTrash() { } // Delete all files in queue_ - uint64_t start_time = env_->NowMicros(); + uint64_t start_time = clock_->NowMicros(); uint64_t total_deleted_bytes = 0; int64_t current_delete_rate = rate_bytes_per_sec_.load(); while (!queue_.empty() && !closing_) { if (current_delete_rate != rate_bytes_per_sec_.load()) { // User changed the delete rate current_delete_rate = rate_bytes_per_sec_.load(); - start_time = env_->NowMicros(); + start_time = clock_->NowMicros(); total_deleted_bytes = 0; ROCKS_LOG_INFO(info_log_, "rate_bytes_per_sec is changed to %" PRIi64, current_delete_rate); diff --git a/file/delete_scheduler.h b/file/delete_scheduler.h index b2d17a73e12..6d3f6b4a4f4 100644 --- a/file/delete_scheduler.h +++ b/file/delete_scheduler.h @@ -15,26 +15,28 @@ #include "monitoring/instrumented_mutex.h" #include "port/port.h" -#include "rocksdb/file_system.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { class Env; +class FileSystem; class Logger; class SstFileManagerImpl; +class SystemClock; // DeleteScheduler allows the DB to enforce a rate limit on file deletion, // Instead of deleteing files immediately, files are marked as trash -// and deleted in a background thread that apply sleep penlty between deletes +// and deleted in a background thread that apply sleep penalty between deletes // if they are happening in a rate faster than rate_bytes_per_sec, // // Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this // case DeleteScheduler will delete files immediately. class DeleteScheduler { public: - DeleteScheduler(Env* env, FileSystem* fs, int64_t rate_bytes_per_sec, - Logger* info_log, SstFileManagerImpl* sst_file_manager, + DeleteScheduler(SystemClock* clock, FileSystem* fs, + int64_t rate_bytes_per_sec, Logger* info_log, + SstFileManagerImpl* sst_file_manager, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~DeleteScheduler(); @@ -48,7 +50,7 @@ class DeleteScheduler { MaybeCreateBackgroundThread(); } - // Mark file as trash directory and schedule it's deletion. If force_bg is + // Mark file as trash directory and schedule its deletion. 
If force_bg is // set, it forces the file to always be deleted in the background thread, // except when rate limiting is disabled Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, @@ -78,7 +80,7 @@ class DeleteScheduler { static const std::string kTrashExtension; static bool IsTrashFile(const std::string& file_path); - // Check if there are any .trash filse in path, and schedule their deletion + // Check if there are any .trash files in path, and schedule their deletion // Or delete immediately if sst_file_manager is nullptr static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm, const std::string& path); @@ -99,7 +101,7 @@ class DeleteScheduler { void MaybeCreateBackgroundThread(); - Env* env_; + SystemClock* clock_; FileSystem* fs_; // total size of trash files diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index 67eaa50e613..e6f590a526b 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -10,7 +10,6 @@ #include #include -#include "env/composite_env_wrapper.h" #include "file/file_util.h" #include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" @@ -58,7 +57,7 @@ class DeleteSchedulerTest : public testing::Test { int normal_cnt = 0; for (auto& f : files_in_dir) { - if (!DeleteScheduler::IsTrashFile(f) && f != "." && f != "..") { + if (!DeleteScheduler::IsTrashFile(f)) { normal_cnt++; } } @@ -88,7 +87,7 @@ class DeleteSchedulerTest : public testing::Test { std::string data(size, 'A'); EXPECT_OK(f->Append(data)); EXPECT_OK(f->Close()); - sst_file_mgr_->OnAddFile(file_path, false); + sst_file_mgr_->OnAddFile(file_path); return file_path; } @@ -96,10 +95,9 @@ class DeleteSchedulerTest : public testing::Test { // Tests in this file are for DeleteScheduler component and don't create any // DBs, so we need to set max_trash_db_ratio to 100% (instead of default // 25%) - std::shared_ptr - fs(std::make_shared(env_)); sst_file_mgr_.reset( - new SstFileManagerImpl(env_, fs, nullptr, rate_bytes_per_sec_, + new SstFileManagerImpl(env_->GetSystemClock(), env_->GetFileSystem(), + nullptr, rate_bytes_per_sec_, /* max_trash_db_ratio= */ 1.1, 128 * 1024)); delete_scheduler_ = sst_file_mgr_->delete_scheduler(); sst_file_mgr_->SetStatisticsPtr(stats_); @@ -426,7 +424,9 @@ TEST_F(DeleteSchedulerTest, BackgroundError) { delete_scheduler_->WaitForEmptyTrash(); auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 10); - + for (const auto& it : bg_errors) { + ASSERT_TRUE(it.second.IsPathNotFound()); + } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -670,7 +670,7 @@ TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) { } for (std::string& file_name : generated_files) { - delete_scheduler_->DeleteFile(file_name, ""); + ASSERT_OK(delete_scheduler_->DeleteFile(file_name, "")); } // When we end up with 26 files in trash we will start diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index 8d9798d09eb..1fe5a367ee0 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -91,23 +91,26 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, size_t read_len = static_cast(roundup_len - chunk_len); s = reader->Read(opts, rounddown_offset + chunk_len, read_len, &result, buffer_.BufferStart() + chunk_len, nullptr, for_compaction); + if (!s.ok()) { + return s; + } + #ifndef NDEBUG - if (!s.ok() || result.size() < read_len) { + if (result.size() < read_len) { // Fake an IO error to force db_stress fault injection 
to ignore // truncated read errors IGNORE_STATUS_IF_ERROR(Status::IOError()); } #endif - if (s.ok()) { - buffer_offset_ = rounddown_offset; - buffer_.Size(static_cast(chunk_len) + result.size()); - } + buffer_offset_ = rounddown_offset; + buffer_.Size(static_cast(chunk_len) + result.size()); return s; } bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, uint64_t offset, size_t n, - Slice* result, bool for_compaction) { + Slice* result, Status* status, + bool for_compaction) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -116,7 +119,7 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, } // If the buffer contains only a few of the requested bytes: - // If readahead is enabled: prefetch the remaining bytes + readadhead bytes + // If readahead is enabled: prefetch the remaining bytes + readahead bytes // and satisfy the request. // If readahead is not enabled: return false. if (offset + n > buffer_offset_ + buffer_.CurrentSize()) { @@ -128,10 +131,34 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, s = Prefetch(opts, file_reader_, offset, std::max(n, readahead_size_), for_compaction); } else { + if (implicit_auto_readahead_) { + // Prefetch only if this read is sequential otherwise reset + // readahead_size_ to initial value. + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, n); + ResetValues(); + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + num_file_reads_++; + if (num_file_reads_ <= kMinNumFileReadsToStartAutoReadahead) { + UpdateReadPattern(offset, n); + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + } s = Prefetch(opts, file_reader_, offset, n + readahead_size_, for_compaction); } if (!s.ok()) { + if (status) { + *status = s; + } +#ifndef NDEBUG + IGNORE_STATUS_IF_ERROR(s); +#endif return false; } readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); @@ -139,7 +166,7 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, return false; } } - + UpdateReadPattern(offset, n); uint64_t offset_in_buffer = offset - buffer_offset_; *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); return true; diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index d8e9c0ff6ea..980496d285e 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -23,6 +23,7 @@ namespace ROCKSDB_NAMESPACE { // FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: + static const int kMinNumFileReadsToStartAutoReadahead = 2; // Constructor. // // All arguments are optional. @@ -38,23 +39,31 @@ class FilePrefetchBuffer { // for the minimum offset if track_min_offset = true. // track_min_offset : Track the minimum offset ever read and collect stats on // it. Used for adaptable readahead of the file footer/metadata. + // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after + // doing sequential scans for two times. // // Automatic readhead is enabled for a file if file_reader, readahead_size, // and max_readahead_size are passed in. - // If file_reader is a nullptr, setting readadhead_size and max_readahead_size + // If file_reader is a nullptr, setting readahead_size and max_readahead_size // does not make any sense. So it does nothing. // A user can construct a FilePrefetchBuffer without any arguments, but use // `Prefetch` to load data into the buffer. 
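// A minimal sketch (illustration only, not part of the patch): driving the
// updated TryReadFromCache() signature, which now reports the prefetch status
// through an out-parameter instead of swallowing it. `reader`, `offset`, and
// `len` are assumed to come from the caller; the readahead sizes are
// placeholders.
Status ReadWithPrefetch(RandomAccessFileReader* reader, uint64_t offset,
                        size_t len, Slice* result) {
  // Readahead starts at 8 KB, may grow up to 256 KB, and is only armed after
  // two sequential reads because implicit_auto_readahead is true.
  FilePrefetchBuffer prefetch_buffer(reader, /*readahead_size=*/8 * 1024,
                                     /*max_readahead_size=*/256 * 1024,
                                     /*enable=*/true,
                                     /*track_min_offset=*/false,
                                     /*implicit_auto_readahead=*/true);
  Status s;
  if (!prefetch_buffer.TryReadFromCache(IOOptions(), offset, len, result,
                                        &s)) {
    // false means either "not in buffer" (s is OK) or a real prefetch
    // failure (s carries the I/O error); the caller decides how to fall back.
    return s.ok() ? Status::Incomplete("not prefetched") : s;
  }
  return Status::OK();
}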
FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, - size_t readadhead_size = 0, size_t max_readahead_size = 0, - bool enable = true, bool track_min_offset = false) + size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false, + bool implicit_auto_readahead = false) : buffer_offset_(0), file_reader_(file_reader), - readahead_size_(readadhead_size), + readahead_size_(readahead_size), max_readahead_size_(max_readahead_size), + initial_readahead_size_(readahead_size), min_offset_read_(port::kMaxSizet), enable_(enable), - track_min_offset_(track_min_offset) {} + track_min_offset_(track_min_offset), + implicit_auto_readahead_(implicit_auto_readahead), + prev_offset_(0), + prev_len_(0), + num_file_reads_(kMinNumFileReadsToStartAutoReadahead + 1) {} // Load data into the buffer from a file. // reader : the file reader. @@ -67,7 +76,7 @@ class FilePrefetchBuffer { // Tries returning the data for a file raed from this buffer, if that data is // in the buffer. // It handles tracking the minimum read offset if track_min_offset = true. - // It also does the exponential readahead when readadhead_size is set as part + // It also does the exponential readahead when readahead_size is set as part // of the constructor. // // offset : the file offset. @@ -75,18 +84,33 @@ class FilePrefetchBuffer { // result : output buffer to put the data into. // for_compaction : if cache read is done for compaction read. bool TryReadFromCache(const IOOptions& opts, uint64_t offset, size_t n, - Slice* result, bool for_compaction = false); + Slice* result, Status* s, bool for_compaction = false); // The minimum `offset` ever passed to TryReadFromCache(). This will nly be // tracked if track_min_offset = true. size_t min_offset_read() const { return min_offset_read_; } + void UpdateReadPattern(const size_t& offset, const size_t& len) { + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const size_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + void ResetValues() { + num_file_reads_ = 1; + readahead_size_ = initial_readahead_size_; + } + private: AlignedBuffer buffer_; uint64_t buffer_offset_; RandomAccessFileReader* file_reader_; size_t readahead_size_; size_t max_readahead_size_; + size_t initial_readahead_size_; // The minimum `offset` ever passed to TryReadFromCache(). size_t min_offset_read_; // if false, TryReadFromCache() always return false, and we only take stats @@ -95,5 +119,12 @@ class FilePrefetchBuffer { // If true, track minimum `offset` ever passed to TryReadFromCache(), which // can be fetched from min_offset_read(). bool track_min_offset_; + + // implicit_auto_readahead is enabled by rocksdb internally after 2 sequential + // IOs. + bool implicit_auto_readahead_; + size_t prev_offset_; + size_t prev_len_; + int num_file_reads_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/file_util.cc b/file/file_util.cc index 69f357a1ad4..70191a7f377 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -124,15 +124,17 @@ bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { } // requested_checksum_func_name brings the function name of the checksum -// generator in checksum_factory. Checksum factories may use or ignore -// requested_checksum_func_name. +// generator in checksum_factory. Empty string is permitted, in which case the +// name of the generator created by the factory is unchecked. 
When +// `requested_checksum_func_name` is non-empty, however, the created generator's +// name must match it, otherwise an `InvalidArgument` error is returned. IOStatus GenerateOneFileChecksum( FileSystem* fs, const std::string& file_path, FileChecksumGenFactory* checksum_factory, const std::string& requested_checksum_func_name, std::string* file_checksum, std::string* file_checksum_func_name, size_t verify_checksums_readahead_size, bool allow_mmap_reads, - std::shared_ptr& io_tracer) { + std::shared_ptr& io_tracer, RateLimiter* rate_limiter) { if (checksum_factory == nullptr) { return IOStatus::InvalidArgument("Checksum factory is invalid"); } @@ -151,14 +153,22 @@ IOStatus GenerateOneFileChecksum( requested_checksum_func_name + " from checksum factory: " + checksum_factory->Name(); return IOStatus::InvalidArgument(msg); + } else { + // For backward compatibility and use in file ingestion clients where there + // is no stored checksum function name, `requested_checksum_func_name` can + // be empty. If we give the requested checksum function name, we expect it + // is the same name of the checksum generator. + if (!requested_checksum_func_name.empty() && + checksum_generator->Name() != requested_checksum_func_name) { + std::string msg = "Expected file checksum generator named '" + + requested_checksum_func_name + + "', while the factory created one " + "named '" + + checksum_generator->Name() + "'"; + return IOStatus::InvalidArgument(msg); + } } - // For backward compatable, requested_checksum_func_name can be empty. - // If we give the requested checksum function name, we expect it is the - // same name of the checksum generator. - assert(!checksum_generator || requested_checksum_func_name.empty() || - requested_checksum_func_name == checksum_generator->Name()); - uint64_t size; IOStatus io_s; std::unique_ptr reader; @@ -173,7 +183,8 @@ IOStatus GenerateOneFileChecksum( return io_s; } reader.reset(new RandomAccessFileReader(std::move(r_file), file_path, - nullptr /*Env*/, io_tracer)); + nullptr /*Env*/, io_tracer, nullptr, + 0, nullptr, rate_limiter)); } // Found that 256 KB readahead size provides the best performance, based on @@ -184,7 +195,7 @@ IOStatus GenerateOneFileChecksum( : default_max_read_ahead_size; FilePrefetchBuffer prefetch_buffer( - reader.get(), readahead_size /* readadhead_size */, + reader.get(), readahead_size /* readahead_size */, readahead_size /* max_readahead_size */, !allow_mmap_reads /* enable */); Slice slice; @@ -194,7 +205,7 @@ IOStatus GenerateOneFileChecksum( size_t bytes_to_read = static_cast(std::min(uint64_t{readahead_size}, size)); if (!prefetch_buffer.TryReadFromCache(opts, offset, bytes_to_read, &slice, - false)) { + nullptr, false)) { return IOStatus::Corruption("file read failed"); } if (slice.size() == 0) { @@ -219,9 +230,6 @@ Status DestroyDir(Env* env, const std::string& dir) { s = env->GetChildren(dir, &files_in_dir); if (s.ok()) { for (auto& file_in_dir : files_in_dir) { - if (file_in_dir == "." || file_in_dir == "..") { - continue; - } std::string path = dir + "/" + file_in_dir; bool is_dir = false; s = env->IsDirectory(path, &is_dir); @@ -231,6 +239,8 @@ Status DestroyDir(Env* env, const std::string& dir) { } else { s = env->DeleteFile(path); } + } else if (s.IsNotSupported()) { + s = Status::OK(); } if (!s.ok()) { // IsDirectory, etc. 
might not report NotFound diff --git a/file/file_util.h b/file/file_util.h index a9b0a95095d..48878833f41 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -12,6 +12,7 @@ #include "rocksdb/file_system.h" #include "rocksdb/sst_file_writer.h" #include "rocksdb/status.h" +#include "rocksdb/system_clock.h" #include "rocksdb/types.h" #include "trace_replay/io_tracer.h" @@ -22,10 +23,23 @@ extern IOStatus CopyFile(FileSystem* fs, const std::string& source, const std::string& destination, uint64_t size, bool use_fsync, const std::shared_ptr& io_tracer = nullptr); +inline IOStatus CopyFile(const std::shared_ptr& fs, + const std::string& source, + const std::string& destination, uint64_t size, + bool use_fsync, + const std::shared_ptr& io_tracer = nullptr) { + return CopyFile(fs.get(), source, destination, size, use_fsync, io_tracer); +} extern IOStatus CreateFile(FileSystem* fs, const std::string& destination, const std::string& contents, bool use_fsync); +inline IOStatus CreateFile(const std::shared_ptr& fs, + const std::string& destination, + const std::string& contents, bool use_fsync) { + return CreateFile(fs.get(), destination, contents, use_fsync); +} + extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& path_to_sync, const bool force_bg, @@ -39,16 +53,26 @@ extern IOStatus GenerateOneFileChecksum( const std::string& requested_checksum_func_name, std::string* file_checksum, std::string* file_checksum_func_name, size_t verify_checksums_readahead_size, bool allow_mmap_reads, - std::shared_ptr& io_tracer); + std::shared_ptr& io_tracer, RateLimiter* rate_limiter = nullptr); -inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, Env* env, - IOOptions& opts) { - if (!env) { - env = Env::Default(); - } +inline IOStatus GenerateOneFileChecksum( + const std::shared_ptr& fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool allow_mmap_reads, + std::shared_ptr& io_tracer) { + return GenerateOneFileChecksum( + fs.get(), file_path, checksum_factory, requested_checksum_func_name, + file_checksum, file_checksum_func_name, verify_checksums_readahead_size, + allow_mmap_reads, io_tracer); +} +inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, + SystemClock* clock, IOOptions& opts) { if (ro.deadline.count()) { - std::chrono::microseconds now = std::chrono::microseconds(env->NowMicros()); + std::chrono::microseconds now = + std::chrono::microseconds(clock->NowMicros()); // Ensure there is atleast 1us available. 
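// A minimal sketch (illustration only, not part of the patch): converting a
// user-level ReadOptions deadline into an IOOptions timeout before issuing a
// read, using the SystemClock-based PrepareIOFromReadOptions() above. The
// reader, clock, and buffer arguments are assumed to come from the caller.
IOStatus ReadWithDeadline(RandomAccessFileReader* reader, SystemClock* clock,
                          const ReadOptions& ro, uint64_t offset, size_t n,
                          Slice* result, char* scratch) {
  IOOptions opts;
  IOStatus io_s = PrepareIOFromReadOptions(ro, clock, opts);
  if (!io_s.ok()) {
    // A deadline that has already passed surfaces here instead of deep
    // inside the file system.
    return io_s;
  }
  return reader->Read(opts, offset, n, result, scratch,
                      /*aligned_buf=*/nullptr);
}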
We don't want to pass a value of // 0 as that means no timeout if (now >= ro.deadline) { diff --git a/file/filename.cc b/file/filename.cc index a7c22d2e773..87bf060d1c2 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -13,7 +13,6 @@ #include #include #include "file/writable_file_writer.h" -#include "logging/logging.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/stop_watch.h" @@ -184,7 +183,8 @@ InfoLogPrefix::InfoLogPrefix(bool has_log_dir, snprintf(buf, sizeof(buf), kInfoLogPrefix); prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1); } else { - size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf)); + size_t len = + GetInfoLogPrefix(NormalizePath(db_absolute_path), buf, sizeof(buf)); prefix = Slice(buf, len); } } @@ -352,7 +352,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number, Slice suffix = rest; if (suffix == Slice("log")) { - *type = kLogFile; + *type = kWalFile; if (log_type && !archive_dir_found) { *log_type = kAliveLogFile; } @@ -383,10 +383,12 @@ IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, contents.remove_prefix(dbname.size() + 1); std::string tmp = TempFileName(dbname, descriptor_number); IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true); + TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); if (s.ok()) { - TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2); s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr); - TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2); + TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s); } if (s.ok()) { if (directory_to_fsync != nullptr) { @@ -419,20 +421,21 @@ Status SetIdentityFile(Env* env, const std::string& dbname, return s; } -IOStatus SyncManifest(Env* env, const ImmutableDBOptions* db_options, +IOStatus SyncManifest(const ImmutableDBOptions* db_options, WritableFileWriter* file) { - TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2); - StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); + TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2); + StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS); return file->Sync(db_options->use_fsync); } -Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, - const std::string& dbname, std::string* parent_dir, +Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, + std::string* parent_dir, std::vector* info_log_list) { assert(parent_dir != nullptr); assert(info_log_list != nullptr); uint64_t number = 0; - FileType type = kLogFile; + FileType type = kWalFile; if (!db_log_dir.empty()) { *parent_dir = db_log_dir; @@ -443,7 +446,7 @@ Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname); std::vector file_names; - Status s = env->GetChildren(*parent_dir, &file_names); + Status s = fs->GetChildren(*parent_dir, IOOptions(), &file_names, nullptr); if (!s.ok()) { return s; diff --git a/file/filename.h b/file/filename.h index f23723244c7..7f34ade281b 100644 --- a/file/filename.h +++ b/file/filename.h @@ -27,6 +27,7 @@ namespace ROCKSDB_NAMESPACE { class Env; class Directory; +class SystemClock; class WritableFileWriter; #ifdef OS_WIN @@ -35,20 +36,6 @@ const char kFilePathSeparator = '\\'; const 
char kFilePathSeparator = '/'; #endif -enum FileType { - kLogFile, - kDBLockFile, - kTableFile, - kDescriptorFile, - kCurrentFile, - kTempFile, - kInfoLogFile, // Either the current one, or an old one - kMetaDatabase, - kIdentityFile, - kOptionsFile, - kBlobFile -}; - // Return the name of the log file with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". @@ -180,14 +167,15 @@ extern Status SetIdentityFile(Env* env, const std::string& dbname, const std::string& db_id = {}); // Sync manifest file `file`. -extern IOStatus SyncManifest(Env* env, const ImmutableDBOptions* db_options, +extern IOStatus SyncManifest(const ImmutableDBOptions* db_options, WritableFileWriter* file); // Return list of file names of info logs in `file_names`. // The list only contains file name. The parent directory name is stored // in `parent_dir`. // `db_log_dir` should be the one as in options.db_log_dir -extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, +extern Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, std::string* parent_dir, std::vector* file_names); diff --git a/file/line_file_reader.cc b/file/line_file_reader.cc new file mode 100644 index 00000000000..8a56a09b2e4 --- /dev/null +++ b/file/line_file_reader.cc @@ -0,0 +1,65 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "file/line_file_reader.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +Status LineFileReader::Create(const std::shared_ptr& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg) { + std::unique_ptr file; + Status s = fs->NewSequentialFile(fname, file_opts, &file, dbg); + if (s.ok()) { + reader->reset(new LineFileReader(std::move(file), fname)); + } + return s; +} + +bool LineFileReader::ReadLine(std::string* out) { + assert(out); + if (!status_.ok()) { + // Status should be checked (or permit unchecked) any time we return false. + status_.MustCheck(); + return false; + } + out->clear(); + for (;;) { + // Look for line delimiter + const char* found = static_cast( + std::memchr(buf_begin_, '\n', buf_end_ - buf_begin_)); + if (found) { + size_t len = found - buf_begin_; + out->append(buf_begin_, len); + buf_begin_ += len + /*delim*/ 1; + ++line_number_; + return true; + } + if (at_eof_) { + status_.MustCheck(); + return false; + } + // else flush and reload buffer + out->append(buf_begin_, buf_end_ - buf_begin_); + Slice result; + status_ = sfr_.Read(buf_.size(), &result, buf_.data()); + if (!status_.ok()) { + status_.MustCheck(); + return false; + } + if (result.size() != buf_.size()) { + // The obscure way of indicating EOF + at_eof_ = true; + } + buf_begin_ = result.data(); + buf_end_ = result.data() + result.size(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/file/line_file_reader.h b/file/line_file_reader.h new file mode 100644 index 00000000000..48d79f327c0 --- /dev/null +++ b/file/line_file_reader.h @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once +#include + +#include "file/sequence_file_reader.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper on top of Env::SequentialFile for reading text lines from a file. +// Lines are delimited by '\n'. The last line may or may not include a +// trailing newline. Uses SequentialFileReader internally. +class LineFileReader { + private: + std::array buf_; + SequentialFileReader sfr_; + Status status_; + const char* buf_begin_ = buf_.data(); + const char* buf_end_ = buf_.data(); + size_t line_number_ = 0; + bool at_eof_ = false; + + public: + // See SequentialFileReader constructors + template + explicit LineFileReader(Args&&... args) + : sfr_(std::forward(args)...) {} + + static Status Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); + + LineFileReader(const LineFileReader&) = delete; + LineFileReader& operator=(const LineFileReader&) = delete; + + // Reads another line from the file, returning true on success and saving + // the line to `out`, without delimiter, or returning false on failure. You + // must check GetStatus() to determine whether the failure was just + // end-of-file (OK status) or an I/O error (another status). + bool ReadLine(std::string* out); + + // Returns the number of the line most recently returned from ReadLine. + // Return value is unspecified if ReadLine has returned false due to + // I/O error. After ReadLine returns false due to end-of-file, return + // value is the last returned line number, or equivalently the total + // number of lines returned. + size_t GetLineNumber() const { return line_number_; } + + // Returns any error encountered during read. The error is considered + // permanent and no retry or recovery is attempted with the same + // LineFileReader. 
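// A minimal sketch (illustration only, not part of the patch): the intended
// ReadLine() loop, where a false return must be disambiguated via
// GetStatus(). The file name and per-line callback are placeholders; assumes
// <functional>, <memory>, and <string> are available.
Status ForEachLine(const std::shared_ptr<FileSystem>& fs,
                   const std::string& fname,
                   const std::function<void(const std::string&)>& cb) {
  std::unique_ptr<LineFileReader> reader;
  Status s = LineFileReader::Create(fs, fname, FileOptions(), &reader,
                                    /*dbg=*/nullptr);
  if (!s.ok()) {
    return s;
  }
  std::string line;
  while (reader->ReadLine(&line)) {
    cb(line);  // reader->GetLineNumber() now reports this line's number
  }
  // false means either end-of-file (OK status) or an I/O error; GetStatus()
  // tells which.
  return reader->GetStatus();
}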
+ const Status& GetStatus() const { return status_; } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index ffe0367a4b2..79b56b944fe 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -25,7 +25,7 @@ class MockRandomAccessFile : public FSRandomAccessFileWrapper { prefetch_count_.fetch_add(1); return target()->Prefetch(offset, n, options, dbg); } else { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("Prefetch not supported"); } } @@ -37,9 +37,9 @@ class MockRandomAccessFile : public FSRandomAccessFileWrapper { class MockFS : public FileSystemWrapper { public: - explicit MockFS(bool support_prefetch) - : FileSystemWrapper(FileSystem::Default()), - support_prefetch_(support_prefetch) {} + explicit MockFS(const std::shared_ptr& wrapped, + bool support_prefetch) + : FileSystemWrapper(wrapped), support_prefetch_(support_prefetch) {} IOStatus NewRandomAccessFile(const std::string& fname, const FileOptions& opts, @@ -57,6 +57,10 @@ class MockFS : public FileSystemWrapper { bool IsPrefetchCalled() { return prefetch_count_ > 0; } + int GetPrefetchCount() { + return prefetch_count_.load(std::memory_order_relaxed); + } + private: const bool support_prefetch_; std::atomic_int prefetch_count_{0}; @@ -69,19 +73,25 @@ class PrefetchTest PrefetchTest() : DBTestBase("/prefetch_test", true) {} }; +INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + std::string BuildKey(int num, std::string postfix = "") { return "my_key_" + std::to_string(num) + postfix; } TEST_P(PrefetchTest, Basic) { // First param is if the mockFS support_prefetch or not - bool support_prefetch = std::get<0>(GetParam()); + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); // Second param is if directIO is enabled or not bool use_direct_io = std::get<1>(GetParam()); - const int kNumKeys = 1100; - std::shared_ptr fs = std::make_shared(support_prefetch); + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); Options options = CurrentOptions(); options.write_buffer_size = 1024; @@ -109,21 +119,21 @@ TEST_P(PrefetchTest, Basic) { // create first key range WriteBatch batch; for (int i = 0; i < kNumKeys; i++) { - batch.Put(BuildKey(i), "value for range 1 key"); + ASSERT_OK(batch.Put(BuildKey(i), "value for range 1 key")); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); // create second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Put(BuildKey(i, "key2"), "value for range 2 key"); + ASSERT_OK(batch.Put(BuildKey(i, "key2"), "value for range 2 key")); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); // delete second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Delete(BuildKey(i, "key2")); + ASSERT_OK(batch.Delete(BuildKey(i, "key2"))); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); @@ -134,7 +144,7 @@ TEST_P(PrefetchTest, Basic) { Slice greatest(end_key.data(), end_key.size()); // commenting out the line below causes the example to work correctly - db_->CompactRange(CompactRangeOptions(), &least, &greatest); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); if (support_prefetch && !use_direct_io) { // If underline file system supports prefetch, and directIO is not enabled @@ -173,9 +183,491 @@ TEST_P(PrefetchTest, Basic) { Close(); } -INSTANTIATE_TEST_CASE_P(PrefetchTest, 
PrefetchTest, - ::testing::Combine(::testing::Bool(), - ::testing::Bool())); +#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + options.disable_auto_compactions = true; + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.max_auto_readahead_size = 0; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + // DB open will create table readers unless we reduce the table cache + // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table + // cache is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 10 so table cache capacity will become 0. This will + // prevent file open during DB open and force the file to be opened during + // Iteration. + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + Random rnd(309); + int key_count = 0; + const int num_keys_per_level = 100; + // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299]. + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + for (int i = 0; i < num_keys_per_level; ++i) { + ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(level); + } + Close(); + std::vector buff_prefectch_level_count = {0, 0, 0}; + TryReopen(options); + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + switch (level) { + case 0: + // max_auto_readahead_size is set 0 so data and index blocks are not + // prefetched. + ASSERT_OK(db_->SetOptions( + {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}})); + break; + case 1: + // max_auto_readahead_size is set less than + // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains + // equal to max_auto_readahead_size. 
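// A minimal sketch (illustration only, not part of the patch): the two ways
// this test exercises the new max_auto_readahead_size knob -- statically via
// BlockBasedTableOptions at open time, and dynamically via SetOptions() on a
// live DB. The 64 KB cap is a placeholder value.
Options MakeOptionsWithCappedReadahead() {
  BlockBasedTableOptions table_options;
  table_options.max_auto_readahead_size = 64 * 1024;  // cap implicit readahead
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  return options;
}

Status CapAutoReadahead(DB* db) {
  // Dynamic path: mutate the table factory options without reopening.
  return db->SetOptions(
      {{"block_based_table_factory", "{max_auto_readahead_size=65536;}"}});
}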
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{max_auto_readahead_size=4096;}"}})); + break; + case 2: + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{max_auto_readahead_size=65536;}"}})); + break; + default: + assert(false); + } + + for (int i = 0; i < num_keys_per_level; ++i) { + iter->Seek(Key(key_count++)); + iter->Next(); + } + + buff_prefectch_level_count[level] = buff_prefetch_count; + if (support_prefetch && !use_direct_io) { + if (level == 0) { + ASSERT_FALSE(fs->IsPrefetchCalled()); + } else { + ASSERT_TRUE(fs->IsPrefetchCalled()); + } + fs->ClearPrefetchCount(); + } else { + ASSERT_FALSE(fs->IsPrefetchCalled()); + if (level == 0) { + ASSERT_EQ(buff_prefetch_count, 0); + } else { + ASSERT_GT(buff_prefetch_count, 0); + } + buff_prefetch_count = 0; + } + } + } + + if (!support_prefetch) { + ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_P(PrefetchTest, PrefetchWhenReseek) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + const int kNumKeys = 2000; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more + * initially (2 more data blocks). 
+ */ + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch Data + iter->Seek(BuildKey(1008)); + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); // Prefetch Data + iter->Seek(BuildKey(1019)); + // Missed 2 blocks but they are already in buffer so no reset. + iter->Seek(BuildKey(103)); // Already in buffer. + iter->Seek(BuildKey(1033)); // Prefetch Data + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 3); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 3); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek keys from non sequential data blocks within same partitioned + * index. buff_prefetch_count will be 0 in that case. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1008)); + iter->Seek(BuildKey(1019)); + iter->Seek(BuildKey(1033)); + iter->Seek(BuildKey(1048)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* + * Reesek keys from Single Data Block. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1)); + iter->Seek(BuildKey(10)); + iter->Seek(BuildKey(100)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek keys from sequential data blocks to set implicit auto readahead + * and prefetch data but after that iterate over different (non sequential) + * data blocks which won't prefetch any data further. So buff_prefetch_count + * will be 1 for the first one. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // This iteration will prefetch buffer + iter->Seek(BuildKey(1008)); + iter->Seek( + BuildKey(996)); // Reseek won't prefetch any data and + // readahead_size will be initiallized to 8*1024. + iter->Seek(BuildKey(992)); + iter->Seek(BuildKey(989)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + + // Read sequentially to confirm readahead_size is reset to initial value (2 + // more data blocks) + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); // Prefetch Data + iter->Seek(BuildKey(1022)); + iter->Seek(BuildKey(1026)); + iter->Seek(BuildKey(103)); // Prefetch Data + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 2); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + } + { + /* Reseek keys from sequential partitioned index block. Since partitioned + * index fetch are sequential, buff_prefetch_count will be 1. 
+ */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1167)); + iter->Seek(BuildKey(1334)); // This iteration will prefetch buffer + iter->Seek(BuildKey(1499)); + iter->Seek(BuildKey(1667)); + iter->Seek(BuildKey(1847)); + iter->Seek(BuildKey(1999)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek over different keys from different blocks. buff_prefetch_count is + * set 0. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + int i = 0; + int j = 1000; + do { + iter->Seek(BuildKey(i)); + if (!iter->Valid()) { + break; + } + i = i + 100; + iter->Seek(BuildKey(j)); + j = j + 100; + } while (i < 1000 && j < kNumKeys && iter->Valid()); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* Iterates sequentially over all keys. It will prefetch the buffer.*/ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + } + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 13); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 13); + buff_prefetch_count = 0; + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} + +TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + const int kNumKeys = 2000; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + + BlockBasedTableOptions table_options; + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice 
greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + { + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more + * initially (2 more data blocks). + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + // Warm up the cache + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + } + { + // After caching, blocks will be read from cache (Sequential blocks) + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch data (not in cache). + // Missed one sequential block but next is in already in buffer so readahead + // will not be reset. + iter->Seek(BuildKey(1011)); + // Prefetch data but blocks are in cache so no prefetch and reset. + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + iter->Seek(BuildKey(1022)); + // Prefetch data with readahead_size = 4 blocks. + iter->Seek(BuildKey(1026)); + iter->Seek(BuildKey(103)); + iter->Seek(BuildKey(1033)); + iter->Seek(BuildKey(1037)); + + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 3); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} } // namespace ROCKSDB_NAMESPACE diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 4d3c1a7f4bf..e15b6b0338d 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -12,6 +12,7 @@ #include #include +#include "file/file_util.h" #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" @@ -21,18 +22,29 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { +IOStatus RandomAccessFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewRandomAccessFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new RandomAccessFileReader(std::move(file), fname)); + } + return io_s; +} -Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, - size_t n, Slice* result, char* scratch, - AlignedBuf* aligned_buf, - bool for_compaction) const { +IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, + size_t n, Slice* result, char* scratch, + AlignedBuf* aligned_buf, + bool for_compaction) const { (void)aligned_buf; TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr); - Status s; + IOStatus io_s; uint64_t elapsed = 0; { - StopWatch sw(env_, stats_, hist_type_, + StopWatch sw(clock_, stats_, hist_type_, (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -68,28 +80,28 @@ Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, } { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); // Only user reads are expected to specify a timeout. And user reads // are not subjected to rate_limiter and should go through only // one iteration of this loop, so we don't need to check and adjust // the opts.timeout before calling file_->Read assert(!opts.timeout.count() || allowed == read_size); - s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts, - &tmp, buf.Destination(), nullptr); + io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts, + &tmp, buf.Destination(), nullptr); } if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, - s); + io_s); } buf.Size(buf.CurrentSize() + tmp.size()); - if (!s.ok() || tmp.size() < allowed) { + if (!io_s.ok() || tmp.size() < allowed) { break; } } size_t res_len = 0; - if (s.ok() && offset_advance < buf.CurrentSize()) { + if (io_s.ok() && offset_advance < buf.CurrentSize()) { res_len = std::min(buf.CurrentSize() - offset_advance, n); if (aligned_buf == nullptr) { buf.Read(scratch, offset_advance, res_len); @@ -128,20 +140,20 @@ Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, #endif { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); // Only user reads are expected to specify a timeout. And user reads // are not subjected to rate_limiter and should go through only // one iteration of this loop, so we don't need to check and adjust // the opts.timeout before calling file_->Read assert(!opts.timeout.count() || allowed == n); - s = file_->Read(offset + pos, allowed, opts, &tmp_result, - scratch + pos, nullptr); + io_s = file_->Read(offset + pos, allowed, opts, &tmp_result, + scratch + pos, nullptr); } #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, - finish_ts, s); + finish_ts, io_s); } #endif @@ -154,11 +166,11 @@ Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, assert(tmp_result.data() == res_scratch + pos); } pos += tmp_result.size(); - if (!s.ok() || tmp_result.size() < allowed) { + if (!io_s.ok() || tmp_result.size() < allowed) { break; } } - *result = Slice(res_scratch, s.ok() ? pos : 0); + *result = Slice(res_scratch, io_s.ok() ? 
pos : 0); } IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); SetPerfLevel(prev_perf_level); @@ -167,7 +179,7 @@ Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, file_read_hist_->Add(elapsed); } - return s; + return io_s; } size_t End(const FSReadRequest& r) { @@ -196,16 +208,16 @@ bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) { return true; } -Status RandomAccessFileReader::MultiRead(const IOOptions& opts, - FSReadRequest* read_reqs, - size_t num_reqs, - AlignedBuf* aligned_buf) const { +IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, + FSReadRequest* read_reqs, + size_t num_reqs, + AlignedBuf* aligned_buf) const { (void)aligned_buf; // suppress warning of unused variable in LITE mode assert(num_reqs > 0); - Status s; + IOStatus io_s; uint64_t elapsed = 0; { - StopWatch sw(env_, stats_, hist_type_, + StopWatch sw(clock_, stats_, hist_type_, (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -221,11 +233,19 @@ Status RandomAccessFileReader::MultiRead(const IOOptions& opts, aligned_reqs.reserve(num_reqs); // Align and merge the read requests. size_t alignment = file_->GetRequiredBufferAlignment(); - aligned_reqs.push_back(Align(read_reqs[0], alignment)); - for (size_t i = 1; i < num_reqs; i++) { + for (size_t i = 0; i < num_reqs; i++) { const auto& r = Align(read_reqs[i], alignment); - if (!TryMerge(&aligned_reqs.back(), r)) { + if (i == 0) { + // head + aligned_reqs.push_back(r); + + } else if (!TryMerge(&aligned_reqs.back(), r)) { + // head + n aligned_reqs.push_back(r); + + } else { + // unused + r.status.PermitUncheckedError(); } } TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs", @@ -259,8 +279,8 @@ Status RandomAccessFileReader::MultiRead(const IOOptions& opts, #endif // ROCKSDB_LITE { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); - s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); + io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr); } #ifndef ROCKSDB_LITE @@ -301,7 +321,15 @@ Status RandomAccessFileReader::MultiRead(const IOOptions& opts, file_read_hist_->Add(elapsed); } - return s; + return io_s; } +IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, + IOOptions& opts) { + if (clock_ != nullptr) { + return PrepareIOFromReadOptions(ro, clock_, opts); + } else { + return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts); + } +} } // namespace ROCKSDB_NAMESPACE diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index a0f7a191736..181f4dd0295 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -14,7 +14,6 @@ #include "env/file_system_tracer.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/listener.h" #include "rocksdb/options.h" @@ -24,6 +23,7 @@ namespace ROCKSDB_NAMESPACE { class Statistics; class HistogramImpl; +class SystemClock; using AlignedBuf = std::unique_ptr; @@ -38,7 +38,7 @@ FSReadRequest Align(const FSReadRequest& r, size_t alignment); // Otherwise, do nothing and return false. bool TryMerge(FSReadRequest* dest, const FSReadRequest& src); -// RandomAccessFileReader is a wrapper on top of Env::RnadomAccessFile. It is +// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is // responsible for: // - Handling Buffered and Direct reads appropriately. 
// - Rate limiting compaction reads. @@ -67,7 +67,7 @@ class RandomAccessFileReader { FSRandomAccessFilePtr file_; std::string file_name_; - Env* env_; + SystemClock* clock_; Statistics* stats_; uint32_t hist_type_; HistogramImpl* file_read_hist_; @@ -77,14 +77,15 @@ class RandomAccessFileReader { public: explicit RandomAccessFileReader( std::unique_ptr&& raf, const std::string& _file_name, - Env* _env = nullptr, const std::shared_ptr& io_tracer = nullptr, + SystemClock* clock = nullptr, + const std::shared_ptr& io_tracer = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = nullptr, const std::vector>& listeners = {}) - : file_(std::move(raf), io_tracer), + : file_(std::move(raf), io_tracer, _file_name), file_name_(std::move(_file_name)), - env_(_env), + clock_(clock), stats_(stats), hist_type_(hist_type), file_read_hist_(file_read_hist), @@ -102,6 +103,10 @@ class RandomAccessFileReader { #endif } + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); RandomAccessFileReader(const RandomAccessFileReader&) = delete; RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; @@ -115,19 +120,19 @@ class RandomAccessFileReader { // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns // the internally allocated buffer on return, and the result refers to a // region in aligned_buf. - Status Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, - char* scratch, AlignedBuf* aligned_buf, - bool for_compaction = false) const; + IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, + char* scratch, AlignedBuf* aligned_buf, + bool for_compaction = false) const; // REQUIRES: // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing. // In non-direct IO mode, aligned_buf should be null; // In direct IO mode, aligned_buf stores the aligned buffer allocated inside // MultiRead, the result Slices in reqs refer to aligned_buf. 
- Status MultiRead(const IOOptions& opts, FSReadRequest* reqs, size_t num_reqs, - AlignedBuf* aligned_buf) const; + IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs, + size_t num_reqs, AlignedBuf* aligned_buf) const; - Status Prefetch(uint64_t offset, size_t n) const { + IOStatus Prefetch(uint64_t offset, size_t n) const { return file_->Prefetch(offset, n, IOOptions(), nullptr); } @@ -137,6 +142,6 @@ class RandomAccessFileReader { bool use_direct_io() const { return file_->use_direct_io(); } - Env* env() const { return env_; } + IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts); }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/random_access_file_reader_test.cc b/file/random_access_file_reader_test.cc index 1358fd4b0f8..77a0e84a0be 100644 --- a/file/random_access_file_reader_test.cc +++ b/file/random_access_file_reader_test.cc @@ -38,11 +38,12 @@ class RandomAccessFileReaderTest : public testing::Test { } void Read(const std::string& fname, const FileOptions& opts, - std::unique_ptr* reader) { + std::unique_ptr* reader) { std::string fpath = Path(fname); std::unique_ptr f; ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr)); - (*reader).reset(new RandomAccessFileReader(std::move(f), fpath, env_)); + reader->reset(new RandomAccessFileReader(std::move(f), fpath, + env_->GetSystemClock().get())); } void AssertResult(const std::string& content, @@ -145,6 +146,7 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { // Reads the first page internally. ASSERT_EQ(aligned_reqs.size(), 1); const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); ASSERT_EQ(aligned_r.offset, 0); ASSERT_EQ(aligned_r.len, page_size); } @@ -189,6 +191,7 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { // Reads the first two pages in one request internally. ASSERT_EQ(aligned_reqs.size(), 1); const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); ASSERT_EQ(aligned_r.offset, 0); ASSERT_EQ(aligned_r.len, 2 * page_size); } @@ -233,6 +236,7 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { // Reads the first 3 pages in one request internally. 
ASSERT_EQ(aligned_reqs.size(), 1); const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); ASSERT_EQ(aligned_r.offset, 0); ASSERT_EQ(aligned_r.len, 3 * page_size); } @@ -270,8 +274,10 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { ASSERT_EQ(aligned_reqs.size(), 2); const FSReadRequest& aligned_r0 = aligned_reqs[0]; const FSReadRequest& aligned_r1 = aligned_reqs[1]; + ASSERT_OK(aligned_r0.status); ASSERT_EQ(aligned_r0.offset, 0); ASSERT_EQ(aligned_r0.len, page_size); + ASSERT_OK(aligned_r1.status); ASSERT_EQ(aligned_r1.offset, 2 * page_size); ASSERT_EQ(aligned_r1.len, page_size); } @@ -287,8 +293,11 @@ TEST(FSReadRequest, Align) { r.offset = 2000; r.len = 2000; r.scratch = nullptr; + ASSERT_OK(r.status); FSReadRequest aligned_r = Align(r, 1024); + ASSERT_OK(r.status); + ASSERT_OK(aligned_r.status); ASSERT_EQ(aligned_r.offset, 1024); ASSERT_EQ(aligned_r.len, 3072); } @@ -303,14 +312,20 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 15; src.len = 10; src.scratch = nullptr; + ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_FALSE(TryMerge(&dest, src)); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -320,16 +335,22 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 10; src.len = 10; src.scratch = nullptr; + ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 20); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -339,16 +360,22 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 5; src.len = 10; src.scratch = nullptr; + ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 15); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -358,16 +385,22 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 5; src.len = 5; src.scratch = nullptr; + ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -377,16 +410,20 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 5; src.len = 1; src.scratch = nullptr; + ASSERT_OK(src.status); if (reverse) std::swap(dest, src); ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -396,16 +433,20 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 0; src.len = 10; src.scratch = nullptr; + ASSERT_OK(src.status); if (reverse) std::swap(dest, src); ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -415,16 +456,20 @@ TEST(FSReadRequest, 
TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 0; src.len = 5; src.scratch = nullptr; + ASSERT_OK(src.status); if (reverse) std::swap(dest, src); ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } } } diff --git a/file/read_write_util.cc b/file/read_write_util.cc index b4854e110fa..9df6c5a39d8 100644 --- a/file/read_write_util.cc +++ b/file/read_write_util.cc @@ -18,47 +18,10 @@ IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, std::unique_ptr* result, const FileOptions& options) { IOStatus s = fs->NewWritableFile(fname, options, result, nullptr); - TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2); return s; } -bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader, - std::string* output, bool* has_data, Status* result) { - const int kBufferSize = 8192; - char buffer[kBufferSize + 1]; - Slice input_slice; - - std::string line; - bool has_complete_line = false; - while (!has_complete_line) { - if (std::getline(*iss, line)) { - has_complete_line = !iss->eof(); - } else { - has_complete_line = false; - } - if (!has_complete_line) { - // if we're not sure whether we have a complete line, - // further read from the file. - if (*has_data) { - *result = seq_file_reader->Read(kBufferSize, &input_slice, buffer); - } - if (input_slice.size() == 0) { - // meaning we have read all the data - *has_data = false; - break; - } else { - iss->str(line + input_slice.ToString()); - // reset the internal state of iss so that we can keep reading it. - iss->clear(); - *has_data = (input_slice.size() == kBufferSize); - continue; - } - } - } - *output = line; - return *has_data || has_complete_line; -} - #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size) { return off % sector_size == 0; diff --git a/file/read_write_util.h b/file/read_write_util.h index 22f4076b344..718135c9885 100644 --- a/file/read_write_util.h +++ b/file/read_write_util.h @@ -24,10 +24,6 @@ extern IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, std::unique_ptr* result, const FileOptions& options); -// Read a single line from a file. 
-bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader, - std::string* output, bool* has_data, Status* result); - #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size); #endif // NDEBUG diff --git a/file/readahead_raf.cc b/file/readahead_raf.cc index 493f9d9e895..6d346432e22 100644 --- a/file/readahead_raf.cc +++ b/file/readahead_raf.cc @@ -11,15 +11,17 @@ #include #include + #include "file/read_write_util.h" +#include "rocksdb/file_system.h" #include "util/aligned_buffer.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { namespace { -class ReadaheadRandomAccessFile : public RandomAccessFile { +class ReadaheadRandomAccessFile : public FSRandomAccessFile { public: - ReadaheadRandomAccessFile(std::unique_ptr&& file, + ReadaheadRandomAccessFile(std::unique_ptr&& file, size_t readahead_size) : file_(std::move(file)), alignment_(file_->GetRequiredBufferAlignment()), @@ -35,11 +37,12 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = delete; - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { // Read-ahead only make sense if we have some slack left after reading if (n + alignment_ >= readahead_size_) { - return file_->Read(offset, n, result, scratch); + return file_->Read(offset, n, options, result, scratch, dbg); } std::unique_lock lk(lock_); @@ -53,14 +56,14 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { (cached_len == n || buffer_.CurrentSize() < readahead_size_)) { // We read exactly what we needed, or we hit end of file - return. *result = Slice(scratch, cached_len); - return Status::OK(); + return IOStatus::OK(); } size_t advanced_offset = static_cast(offset + cached_len); // In the case of cache hit advanced_offset is already aligned, means that // chunk_offset equals to advanced_offset size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); - Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + IOStatus s = ReadIntoBuffer(chunk_offset, readahead_size_, options, dbg); if (s.ok()) { // The data we need is now in cache, so we can safely read it size_t remaining_len; @@ -71,11 +74,12 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { return s; } - Status Prefetch(uint64_t offset, size_t n) override { + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { if (n < readahead_size_) { // Don't allow smaller prefetches than the configured `readahead_size_`. // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. 
- return Status::OK(); + return IOStatus::OK(); } std::unique_lock lk(lock_); @@ -83,10 +87,11 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { size_t offset_ = static_cast(offset); size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); if (prefetch_offset == buffer_offset_) { - return Status::OK(); + return IOStatus::OK(); } return ReadIntoBuffer(prefetch_offset, - Roundup(offset_ + n, alignment_) - prefetch_offset); + Roundup(offset_ + n, alignment_) - prefetch_offset, + options, dbg); } size_t GetUniqueId(char* id, size_t max_size) const override { @@ -95,7 +100,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { void Hint(AccessPattern pattern) override { file_->Hint(pattern); } - Status InvalidateCache(size_t offset, size_t length) override { + IOStatus InvalidateCache(size_t offset, size_t length) override { std::unique_lock lk(lock_); buffer_.Clear(); return file_->InvalidateCache(offset, length); @@ -125,14 +130,16 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { // Reads into buffer_ the next n bytes from file_ starting at offset. // Can actually read less if EOF was reached. // Returns the status of the read operastion on the file. - Status ReadIntoBuffer(uint64_t offset, size_t n) const { + IOStatus ReadIntoBuffer(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) const { if (n > buffer_.Capacity()) { n = buffer_.Capacity(); } assert(IsFileSectorAligned(offset, alignment_)); assert(IsFileSectorAligned(n, alignment_)); Slice result; - Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + IOStatus s = + file_->Read(offset, n, options, &result, buffer_.BufferStart(), dbg); if (s.ok()) { buffer_offset_ = offset; buffer_.Size(result.size()); @@ -141,7 +148,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { return s; } - const std::unique_ptr file_; + const std::unique_ptr file_; const size_t alignment_; const size_t readahead_size_; @@ -153,9 +160,9 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { }; } // namespace -std::unique_ptr NewReadaheadRandomAccessFile( - std::unique_ptr&& file, size_t readahead_size) { - std::unique_ptr result( +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size) { + std::unique_ptr result( new ReadaheadRandomAccessFile(std::move(file), readahead_size)); return result; } diff --git a/file/readahead_raf.h b/file/readahead_raf.h index cbdcb124fd0..dfaf2b4fa94 100644 --- a/file/readahead_raf.h +++ b/file/readahead_raf.h @@ -8,10 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include -#include "rocksdb/env.h" +#include + +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { +class FSRandomAccessFile; // This file provides the following main abstractions: // SequentialFileReader : wrapper over Env::SequentialFile // RandomAccessFileReader : wrapper over Env::RandomAccessFile @@ -22,6 +24,6 @@ namespace ROCKSDB_NAMESPACE { // NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to // always prefetch additional data with every read. This is mainly used in // Compaction Table Readers. 
-std::unique_ptr NewReadaheadRandomAccessFile( - std::unique_ptr&& file, size_t readahead_size); +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size); } // namespace ROCKSDB_NAMESPACE diff --git a/file/sequence_file_reader.cc b/file/sequence_file_reader.cc index 81c5e5d1d46..3a87b6d102f 100644 --- a/file/sequence_file_reader.cc +++ b/file/sequence_file_reader.cc @@ -22,6 +22,18 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { +Status SequentialFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, std::unique_ptr* reader, + IODebugContext* dbg) { + std::unique_ptr file; + Status s = fs->NewSequentialFile(fname, file_opts, &file, dbg); + if (s.ok()) { + reader->reset(new SequentialFileReader(std::move(file), fname)); + } + return s; +} + Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { Status s; if (use_direct_io()) { diff --git a/file/sequence_file_reader.h b/file/sequence_file_reader.h index 139e5255fbc..ea315f853e2 100644 --- a/file/sequence_file_reader.h +++ b/file/sequence_file_reader.h @@ -31,7 +31,8 @@ class SequentialFileReader { explicit SequentialFileReader( std::unique_ptr&& _file, const std::string& _file_name, const std::shared_ptr& io_tracer = nullptr) - : file_name_(_file_name), file_(std::move(_file), io_tracer) {} + : file_name_(_file_name), + file_(std::move(_file), io_tracer, _file_name) {} explicit SequentialFileReader( std::unique_ptr&& _file, const std::string& _file_name, @@ -39,7 +40,11 @@ class SequentialFileReader { const std::shared_ptr& io_tracer = nullptr) : file_name_(_file_name), file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size), - io_tracer) {} + io_tracer, _file_name) {} + static Status Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); SequentialFileReader(const SequentialFileReader&) = delete; SequentialFileReader& operator=(const SequentialFileReader&) = delete; diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index baf58d6b8e0..cc03e54441d 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -9,7 +9,6 @@ #include #include "db/db_impl/db_impl.h" -#include "env/composite_env_wrapper.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" @@ -19,21 +18,21 @@ namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE -SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr fs, - std::shared_ptr logger, - int64_t rate_bytes_per_sec, - double max_trash_db_ratio, - uint64_t bytes_max_delete_chunk) - : env_(env), +SstFileManagerImpl::SstFileManagerImpl( + const std::shared_ptr& clock, + const std::shared_ptr& fs, + const std::shared_ptr& logger, int64_t rate_bytes_per_sec, + double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) + : clock_(clock), fs_(fs), logger_(logger), total_files_size_(0), - in_progress_files_size_(0), compaction_buffer_size_(0), cur_compactions_reserved_size_(0), max_allowed_space_(0), - delete_scheduler_(env, fs_.get(), rate_bytes_per_sec, logger.get(), this, - max_trash_db_ratio, bytes_max_delete_chunk), + delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec, + logger.get(), this, max_trash_db_ratio, + bytes_max_delete_chunk), cv_(&mu_), closing_(false), bg_thread_(nullptr), @@ -60,23 +59,24 @@ void SstFileManagerImpl::Close() { } } -Status SstFileManagerImpl::OnAddFile(const std::string& 
file_path, - bool compaction) { +Status SstFileManagerImpl::OnAddFile(const std::string& file_path) { uint64_t file_size; Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr); if (s.ok()) { MutexLock l(&mu_); - OnAddFileImpl(file_path, file_size, compaction); + OnAddFileImpl(file_path, file_size); } - TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast(&file_path)); return s; } Status SstFileManagerImpl::OnAddFile(const std::string& file_path, - uint64_t file_size, bool compaction) { + uint64_t file_size) { MutexLock l(&mu_); - OnAddFileImpl(file_path, file_size, compaction); - TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + OnAddFileImpl(file_path, file_size); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast(&file_path)); return Status::OK(); } @@ -85,7 +85,8 @@ Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) { MutexLock l(&mu_); OnDeleteFileImpl(file_path); } - TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile"); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile", + const_cast(&file_path)); return Status::OK(); } @@ -99,19 +100,6 @@ void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) { } } cur_compactions_reserved_size_ -= size_added_by_compaction; - - auto new_files = c->edit()->GetNewFiles(); - for (auto& new_file : new_files) { - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - new_file.second.fd.GetNumber(), - new_file.second.fd.GetPathId()); - if (in_progress_files_.find(fn) != in_progress_files_.end()) { - auto tracked_file = tracked_files_.find(fn); - assert(tracked_file != tracked_files_.end()); - in_progress_files_size_ -= tracked_file->second; - in_progress_files_.erase(fn); - } - } } Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, @@ -122,7 +110,7 @@ Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, if (file_size != nullptr) { *file_size = tracked_files_[old_path]; } - OnAddFileImpl(new_path, tracked_files_[old_path], false); + OnAddFileImpl(new_path, tracked_files_[old_path]); OnDeleteFileImpl(old_path); } TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile"); @@ -159,7 +147,7 @@ bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() { bool SstFileManagerImpl::EnoughRoomForCompaction( ColumnFamilyData* cfd, const std::vector& inputs, - Status bg_error) { + const Status& bg_error) { MutexLock l(&mu_); uint64_t size_added_by_compaction = 0; // First check if we even have the space to do the compaction @@ -184,7 +172,7 @@ bool SstFileManagerImpl::EnoughRoomForCompaction( // seen a NoSpace() error. 
This is tin order to contain a single potentially // misbehaving DB instance and prevent it from slowing down compactions of // other DB instances - if (bg_error == Status::NoSpace() && CheckFreeSpace()) { + if (bg_error.IsNoSpace() && CheckFreeSpace()) { auto fn = TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(), inputs[0][0]->fd.GetPathId()); @@ -199,7 +187,6 @@ bool SstFileManagerImpl::EnoughRoomForCompaction( if (compaction_buffer_size_ == 0) { needed_headroom += reserved_disk_buffer_; } - needed_headroom -= in_progress_files_size_; if (free_space < needed_headroom + size_added_by_compaction) { // We hit the condition of not enough disk space ROCKS_LOG_ERROR(logger_, @@ -348,7 +335,7 @@ void SstFileManagerImpl::ClearError() { if (!error_handler_list_.empty()) { // If there are more instances to be recovered, reschedule after 5 // seconds - int64_t wait_until = env_->NowMicros() + 5000000; + int64_t wait_until = clock_->NowMicros() + 5000000; cv_.TimedWait(wait_until); } @@ -440,24 +427,15 @@ void SstFileManagerImpl::WaitForEmptyTrash() { } void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, - uint64_t file_size, bool compaction) { + uint64_t file_size) { auto tracked_file = tracked_files_.find(file_path); if (tracked_file != tracked_files_.end()) { // File was added before, we will just update the size - assert(!compaction); total_files_size_ -= tracked_file->second; total_files_size_ += file_size; cur_compactions_reserved_size_ -= file_size; } else { total_files_size_ += file_size; - if (compaction) { - // Keep track of the size of files created by in-progress compactions. - // When calculating whether there's enough headroom for new compactions, - // this will be subtracted from cur_compactions_reserved_size_. - // Otherwise, compactions will be double counted. 
- in_progress_files_size_ += file_size; - in_progress_files_.insert(file_path); - } } tracked_files_[file_path] = file_size; } @@ -466,16 +444,10 @@ void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) { auto tracked_file = tracked_files_.find(file_path); if (tracked_file == tracked_files_.end()) { // File is not tracked - assert(in_progress_files_.find(file_path) == in_progress_files_.end()); return; } total_files_size_ -= tracked_file->second; - // Check if it belonged to an in-progress compaction - if (in_progress_files_.find(file_path) != in_progress_files_.end()) { - in_progress_files_size_ -= tracked_file->second; - in_progress_files_.erase(file_path); - } tracked_files_.erase(tracked_file); } @@ -485,14 +457,7 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr info_log, bool delete_existing_trash, Status* status, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) { - std::shared_ptr fs; - - if (env == Env::Default()) { - fs = FileSystem::Default(); - } else { - fs.reset(new LegacyFileSystemWrapper(env)); - } - + const auto& fs = env->GetFileSystem(); return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec, delete_existing_trash, status, max_trash_db_ratio, bytes_max_delete_chunk); @@ -505,8 +470,9 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr fs, bool delete_existing_trash, Status* status, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) { + const auto& clock = env->GetSystemClock(); SstFileManagerImpl* res = - new SstFileManagerImpl(env, fs, info_log, rate_bytes_per_sec, + new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec, max_trash_db_ratio, bytes_max_delete_chunk); // trash_dir is deprecated and not needed anymore, but if user passed it @@ -517,10 +483,6 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr fs, s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr); if (s.ok()) { for (const std::string& trash_file : files_in_trash) { - if (trash_file == "." || trash_file == "..") { - continue; - } - std::string path_in_trash = trash_dir + "/" + trash_file; res->OnAddFile(path_in_trash); Status file_delete = diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index 9f4d99049f0..796a8df82fb 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -12,48 +12,45 @@ #include "port/port.h" #include "db/compaction/compaction.h" -#include "db/error_handler.h" #include "file/delete_scheduler.h" -#include "rocksdb/file_system.h" #include "rocksdb/sst_file_manager.h" namespace ROCKSDB_NAMESPACE { - -class Env; +class ErrorHandler; +class FileSystem; +class SystemClock; class Logger; -// SstFileManager is used to track SST files in the DB and control there -// deletion rate. -// All SstFileManager public functions are thread-safe. +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. class SstFileManagerImpl : public SstFileManager { public: - explicit SstFileManagerImpl(Env* env, std::shared_ptr fs, - std::shared_ptr logger, + explicit SstFileManagerImpl(const std::shared_ptr& clock, + const std::shared_ptr& fs, + const std::shared_ptr& logger, int64_t rate_bytes_per_sec, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~SstFileManagerImpl(); - // DB will call OnAddFile whenever a new sst file is added. 
- virtual Status OnAddFile(const std::string& file_path, - bool compaction = false); + // DB will call OnAddFile whenever a new sst/blob file is added. + virtual Status OnAddFile(const std::string& file_path); // Overload where size of the file is provided by the caller rather than // queried from the filesystem. This is an optimization. - Status OnAddFile(const std::string& file_path, uint64_t file_size, - bool compaction); + Status OnAddFile(const std::string& file_path, uint64_t file_size); - // DB will call OnDeleteFile whenever an sst file is deleted. + // DB will call OnDeleteFile whenever a sst/blob file is deleted. Status OnDeleteFile(const std::string& file_path); - // DB will call OnMoveFile whenever an sst file is move to a new path. + // DB will call OnMoveFile whenever a sst/blob file is move to a new path. Status OnMoveFile(const std::string& old_path, const std::string& new_path, uint64_t* file_size = nullptr); // Update the maximum allowed space that should be used by RocksDB, if - // the total size of the SST files exceeds max_allowed_space, writes to - // RocksDB will fail. + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. // // Setting max_allowed_space to 0 will disable this feature, maximum allowed // space will be infinite (Default value). @@ -63,8 +60,8 @@ class SstFileManagerImpl : public SstFileManager { void SetCompactionBufferSize(uint64_t compaction_buffer_size) override; - // Return true if the total size of SST files exceeded the maximum allowed - // space usage. + // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. // // thread-safe. bool IsMaxAllowedSpaceReached() override; @@ -78,7 +75,7 @@ class SstFileManagerImpl : public SstFileManager { // the full compaction size). bool EnoughRoomForCompaction(ColumnFamilyData* cfd, const std::vector& inputs, - Status bg_error); + const Status& bg_error); // Bookkeeping so total_file_sizes_ goes back to normal after compaction // finishes @@ -143,8 +140,7 @@ class SstFileManagerImpl : public SstFileManager { private: // REQUIRES: mutex locked - void OnAddFileImpl(const std::string& file_path, uint64_t file_size, - bool compaction); + void OnAddFileImpl(const std::string& file_path, uint64_t file_size); // REQUIRES: mutex locked void OnDeleteFileImpl(const std::string& file_path); @@ -153,15 +149,13 @@ class SstFileManagerImpl : public SstFileManager { return bg_err_.severity() == Status::Severity::kSoftError; } - Env* env_; + std::shared_ptr clock_; std::shared_ptr fs_; std::shared_ptr logger_; // Mutex to protect tracked_files_, total_files_size_ port::Mutex mu_; // The summation of the sizes of all files in tracked_files_ map uint64_t total_files_size_; - // The summation of all output files of in-progress compactions - uint64_t in_progress_files_size_; // Compactions should only execute if they can leave at least // this amount of buffer space for logs and flushes uint64_t compaction_buffer_size_; @@ -170,9 +164,7 @@ class SstFileManagerImpl : public SstFileManager { // A map containing all tracked files and there sizes // file_path => file_size std::unordered_map tracked_files_; - // A set of files belonging to in-progress compactions - std::unordered_set in_progress_files_; - // The maximum allowed space (in bytes) for sst files. + // The maximum allowed space (in bytes) for sst and blob files. uint64_t max_allowed_space_; // DeleteScheduler used to throttle file deletition. 
DeleteScheduler delete_scheduler_; @@ -192,7 +184,7 @@ class SstFileManagerImpl : public SstFileManager { // compactions to run full throttle. If disk space is below this trigger, // compactions will be gated by free disk space > input size uint64_t free_space_trigger_; - // List of database error handler instances tracked by this sst file manager + // List of database error handler instances tracked by this SstFileManager. std::list error_handler_list_; // Pointer to ErrorHandler instance that is currently processing recovery ErrorHandler* cur_instance_; diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc index eafd8b66acc..d009542c727 100644 --- a/file/writable_file_writer.cc +++ b/file/writable_file_writer.cc @@ -16,19 +16,33 @@ #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" +#include "util/crc32c.h" #include "util/random.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { +Status WritableFileWriter::Create(const std::shared_ptr& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* writer, + IODebugContext* dbg) { + std::unique_ptr file; + Status s = fs->NewWritableFile(fname, file_opts, &file, dbg); + if (s.ok()) { + writer->reset(new WritableFileWriter(std::move(file), fname, file_opts)); + } + return s; +} + IOStatus WritableFileWriter::Append(const Slice& data) { const char* src = data.data(); size_t left = data.size(); IOStatus s; pending_sync_ = true; - TEST_KILL_RANDOM("WritableFileWriter::Append:0", - rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Append:0", REDUCE_ODDS2); // Calculate the checksum of appended data UpdateFileChecksum(data); @@ -89,7 +103,7 @@ IOStatus WritableFileWriter::Append(const Slice& data) { s = WriteBuffered(src, left); } - TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Append:1"); if (s.ok()) { filesize_ += data.size(); } @@ -177,7 +191,7 @@ IOStatus WritableFileWriter::Close() { } } - TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Close:0"); { #ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; @@ -198,7 +212,7 @@ IOStatus WritableFileWriter::Close() { } writable_file_.reset(); - TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Close:1"); if (s.ok() && checksum_generator_ != nullptr && !checksum_finalized_) { checksum_generator_->Finalize(); @@ -212,8 +226,7 @@ IOStatus WritableFileWriter::Close() { // enabled IOStatus WritableFileWriter::Flush() { IOStatus s; - TEST_KILL_RANDOM("WritableFileWriter::Flush:0", - rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2); if (buf_.CurrentSize() > 0) { if (use_direct_io()) { @@ -302,14 +315,14 @@ IOStatus WritableFileWriter::Sync(bool use_fsync) { if (!s.ok()) { return s; } - TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Sync:0"); if (!use_direct_io() && pending_sync_) { s = SyncInternal(use_fsync); if (!s.ok()) { return s; } } - TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Sync:1"); pending_sync_ = false; return IOStatus::OK(); } @@ -331,7 +344,7 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { 
IOSTATS_TIMER_GUARD(fsync_nanos); TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); auto prev_perf_level = GetPerfLevel(); - IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); #ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { @@ -381,6 +394,8 @@ IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) { assert(!use_direct_io()); const char* src = data; size_t left = size; + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; while (left > 0) { size_t allowed; @@ -406,8 +421,16 @@ IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) { #endif { auto prev_perf_level = GetPerfLevel(); - IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); - s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->Append(Slice(src, allowed), IOOptions(), v_info, + nullptr); + } else { + s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr); + } SetPerfLevel(prev_perf_level); } #ifndef ROCKSDB_LITE @@ -422,7 +445,7 @@ IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) { } IOSTATS_ADD(bytes_written, allowed); - TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0"); left -= allowed; src += allowed; @@ -437,6 +460,19 @@ void WritableFileWriter::UpdateFileChecksum(const Slice& data) { } } +// Currently, crc32c checksum is used to calculate the checksum value of the +// content in the input buffer for handoff. In the future, the checksum might be +// calculated from the existing crc32c checksums of the in WAl and Manifest +// records, or even SST file blocks. +// TODO: effectively use the existing checksum of the data being writing to +// generate the crc32c checksum instead of a raw calculation. +void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data, + size_t size, + char* buf) { + uint32_t v_crc32c = crc32c::Extend(0, data, size); + EncodeFixed32(buf, v_crc32c); +} + // This flushes the accumulated data in the buffer. We pad data with zeros if // necessary to the whole page. // However, during automatic flushes padding would not be necessary. 
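The checksum handoff added above is deliberately simple: the writer computes a little-endian crc32c over exactly the bytes it hands to `Append`/`PositionedAppend` and ships it in `DataVerificationInfo::checksum`. Below is a minimal sketch of both ends, assuming RocksDB's `util/crc32c.h` and `util/coding.h` helpers; `VerifyHandoffChecksum` is a hypothetical helper illustrating what a verifying `FileSystem` could do with the handed-off value, not part of this patch.

```
#include <cstddef>
#include <cstdint>

#include "rocksdb/slice.h"
#include "util/coding.h"   // EncodeFixed32 / DecodeFixed32
#include "util/crc32c.h"   // crc32c::Extend

namespace ROCKSDB_NAMESPACE {

// Producer side: what WritableFileWriter does right before handing the
// buffer to FSWritableFile::Append / PositionedAppend.
inline Slice MakeHandoffChecksum(const char* data, size_t size,
                                 char (&buf)[sizeof(uint32_t)]) {
  EncodeFixed32(buf, crc32c::Extend(0, data, size));
  return Slice(buf, sizeof(uint32_t));
}

// Consumer side (hypothetical): a FileSystem that honors
// DataVerificationInfo can recompute the crc32c over the bytes it received
// and compare against the handed-off value before acknowledging the write.
inline bool VerifyHandoffChecksum(const Slice& data, const Slice& checksum) {
  return checksum.size() == sizeof(uint32_t) &&
         DecodeFixed32(checksum.data()) ==
             crc32c::Extend(0, data.data(), data.size());
}

}  // namespace ROCKSDB_NAMESPACE
```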
@@ -467,6 +503,8 @@ IOStatus WritableFileWriter::WriteDirect() { const char* src = buf_.BufferStart(); uint64_t write_offset = next_write_offset_; size_t left = buf_.CurrentSize(); + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; while (left > 0) { // Check how much is allowed @@ -487,8 +525,16 @@ IOStatus WritableFileWriter::WriteDirect() { start_ts = FileOperationInfo::StartNow(); } // direct writes must be positional - s = writable_file_->PositionedAppend(Slice(src, size), write_offset, - IOOptions(), nullptr); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, size, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, + IOOptions(), v_info, nullptr); + } else { + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, + IOOptions(), nullptr); + } + if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h index 51fbcc04b6f..e6894281cc2 100644 --- a/file/writable_file_writer.h +++ b/file/writable_file_writer.h @@ -14,7 +14,6 @@ #include "db/version_edit.h" #include "env/file_system_tracer.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" #include "rocksdb/io_status.h" @@ -25,6 +24,7 @@ namespace ROCKSDB_NAMESPACE { class Statistics; +class SystemClock; // WritableFileWriter is a wrapper on top of Env::WritableFile. It provides // facilities to: @@ -118,10 +118,12 @@ class WritableFileWriter { bool ShouldNotifyListeners() const { return !listeners_.empty(); } void UpdateFileChecksum(const Slice& data); + void Crc32cHandoffChecksumCalculation(const char* data, size_t size, + char* buf); std::string file_name_; FSWritableFilePtr writable_file_; - Env* env_; + SystemClock* clock_; AlignedBuffer buf_; size_t max_buffer_size_; // Actually written data size can be used for truncate @@ -141,18 +143,20 @@ class WritableFileWriter { std::vector> listeners_; std::unique_ptr checksum_generator_; bool checksum_finalized_; + bool perform_data_verification_; public: WritableFileWriter( std::unique_ptr&& file, const std::string& _file_name, - const FileOptions& options, Env* env = nullptr, + const FileOptions& options, SystemClock* clock = nullptr, const std::shared_ptr& io_tracer = nullptr, Statistics* stats = nullptr, const std::vector>& listeners = {}, - FileChecksumGenFactory* file_checksum_gen_factory = nullptr) + FileChecksumGenFactory* file_checksum_gen_factory = nullptr, + bool perform_data_verification = false) : file_name_(_file_name), - writable_file_(std::move(file), io_tracer), - env_(env), + writable_file_(std::move(file), io_tracer, _file_name), + clock_(clock), buf_(), max_buffer_size_(options.writable_file_max_buffer_size), filesize_(0), @@ -166,7 +170,8 @@ class WritableFileWriter { stats_(stats), listeners_(), checksum_generator_(nullptr), - checksum_finalized_(false) { + checksum_finalized_(false), + perform_data_verification_(perform_data_verification) { TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", reinterpret_cast(max_buffer_size_)); buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); @@ -190,6 +195,10 @@ class WritableFileWriter { } } + static Status Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* writer, + 
IODebugContext* dbg); WritableFileWriter(const WritableFileWriter&) = delete; WritableFileWriter& operator=(const WritableFileWriter&) = delete; diff --git a/fuzz/Makefile b/fuzz/Makefile new file mode 100644 index 00000000000..fa45b9e7831 --- /dev/null +++ b/fuzz/Makefile @@ -0,0 +1,61 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +ROOT_DIR = $(abspath $(shell pwd)/../) + +include $(ROOT_DIR)/make_config.mk + +PROTOBUF_CFLAGS = `pkg-config --cflags protobuf` +PROTOBUF_LDFLAGS = `pkg-config --libs protobuf` + +PROTOBUF_MUTATOR_CFLAGS = `pkg-config --cflags libprotobuf-mutator` +PROTOBUF_MUTATOR_LDFLAGS = `pkg-config --libs libprotobuf-mutator` + +ROCKSDB_INCLUDE_DIR = $(ROOT_DIR)/include +ROCKSDB_LIB_DIR = $(ROOT_DIR) + +PROTO_IN = $(ROOT_DIR)/fuzz/proto +PROTO_OUT = $(ROOT_DIR)/fuzz/proto/gen + +ifneq ($(FUZZ_ENV), ossfuzz) +CC = clang++ +CCFLAGS += -Wall -fsanitize=address,fuzzer +CFLAGS += $(PLATFORM_CXXFLAGS) $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) +LDFLAGS += $(PLATFORM_LDFLAGS) $(PROTOBUF_LDFLAGS) $(PROTOBUF_MUTATOR_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +else +# OSS-Fuzz sets various environment flags that are used for compilation. +# These environment flags depend on which type of sanitizer build is being +# used, however, an ASan build would set the environment flags as follows: +# CFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \ + -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \ + -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link" +# CXXFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \ + -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \ + -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link \ + -stdlib=libc++" +# LIB_FUZZING_ENGINE="-fsanitize=fuzzer" +CC = $(CXX) +CCFLAGS = $(CXXFLAGS) +CFLAGS += $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) +LDFLAGS += $(PLATFORM_LDFLAGS) $(LIB_FUZZING_ENGINE) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +endif + +.PHONY: gen_proto + +gen_proto: + mkdir -p $(PROTO_OUT) + protoc \ + --proto_path=$(PROTO_IN) \ + --cpp_out=$(PROTO_OUT) \ + $(PROTO_IN)/*.proto + +db_fuzzer: db_fuzzer.cc + $(CC) $(CCFLAGS) -o db_fuzzer db_fuzzer.cc $(CFLAGS) $(LDFLAGS) + +db_map_fuzzer: gen_proto db_map_fuzzer.cc proto/gen/db_operation.pb.cc + $(CC) $(CCFLAGS) -o db_map_fuzzer db_map_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS) + +sst_file_writer_fuzzer: gen_proto sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc + $(CC) $(CCFLAGS) -o sst_file_writer_fuzzer sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS) diff --git a/fuzz/README.md b/fuzz/README.md new file mode 100644 index 00000000000..79b89bbc330 --- /dev/null +++ b/fuzz/README.md @@ -0,0 +1,160 @@ +# Fuzzing RocksDB + +## Overview + +This directory contains [fuzz tests](https://en.wikipedia.org/wiki/Fuzzing) for RocksDB. +RocksDB testing infrastructure currently includes unit tests and [stress tests](https://github.com/facebook/rocksdb/wiki/Stress-test), +we hope fuzz testing can catch more bugs. 
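Each test in this directory is an ordinary [libFuzzer](http://llvm.org/docs/LibFuzzer.html) target: it is built with `-fsanitize=address,fuzzer` (see the Makefile above) and exposes the `LLVMFuzzerTestOneInput` entry point, either directly or via `DEFINE_PROTO_FUZZER` for the structure-aware tests. As a rough sketch of the shape of such a target (the `CheckedParse` helper here is hypothetical and is not one of the fuzzers in this directory):

```
#include <cstddef>
#include <cstdint>

#include "rocksdb/slice.h"

// Hypothetical function under test; the real targets below exercise
// rocksdb::DB, rocksdb::SstFileWriter, etc.
static void CheckedParse(const rocksdb::Slice& input) {
  if (input.size() >= 3 && input[0] == 'F' && input[1] == 'U' &&
      input[2] == 'Z') {
    // A real bug on this path would surface as a sanitizer report or crash.
  }
}

// libFuzzer calls this entry point repeatedly with mutated inputs.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  CheckedParse(rocksdb::Slice(reinterpret_cast<const char*>(data), size));
  return 0;  // non-zero return values are reserved by libFuzzer
}
```

The sections below cover the prerequisites and then walk through a full structure-aware example.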
+ +## Prerequisite + +We use [LLVM libFuzzer](http://llvm.org/docs/LibFuzzer.html) as the fuzzing engine, +so make sure you have [clang](https://clang.llvm.org/get_started.html) as your compiler. + +Some tests rely on [structure aware fuzzing](https://github.com/google/fuzzing/blob/master/docs/structure-aware-fuzzing.md). +We use [protobuf](https://developers.google.com/protocol-buffers) to define structured input to the fuzzer, +and use [libprotobuf-mutator](https://github.com/google/libprotobuf-mutator) as the custom libFuzzer mutator. +So make sure you have protobuf and libprotobuf-mutator installed, and make sure `pkg-config` can find them. + +## Example + +This example shows you how to apply structure-aware fuzzing to `rocksdb::SstFileWriter`. + +After walking through the steps to create the fuzzer, we'll introduce a bug into `rocksdb::SstFileWriter::Put`, +then show that the fuzzer can catch the bug. + +### Design the test + +We want the fuzzing engine to automatically generate a list of database operations. +We then apply these operations to `SstFileWriter` in sequence; +finally, after the SST file is generated, we use `SstFileReader` to check the file's checksum. + +### Define input + +We define the database operations in protobuf; each operation has an operation type and a key-value pair. +See [proto/db_operation.proto](proto/db_operation.proto) for details. + +### Define tests with the input + +In [sst_file_writer_fuzzer.cc](sst_file_writer_fuzzer.cc), +we define the tests to be run on the generated input: + +``` +DEFINE_PROTO_FUZZER(DBOperations& input) { + // apply the operations to SstFileWriter and use SstFileReader to verify checksum. + // ... +} +``` + +`SstFileWriter` requires the keys of the operations to be unique and in ascending order, +but the fuzzing engine generates the input randomly, so we need to process the generated input before +passing it to `DEFINE_PROTO_FUZZER`. This is accomplished by registering a post processor: + +``` +protobuf_mutator::libfuzzer::PostProcessorRegistration +``` + +### Compile and link the fuzzer + +In the rocksdb root directory, compile the rocksdb library with `make static_lib`. + +Go to the `fuzz` directory and +run `make sst_file_writer_fuzzer` to generate the fuzzer: +it will compile the rocksdb static library, generate the protobuf code, then compile and link `sst_file_writer_fuzzer`. + +### Introduce a bug + +Manually introduce a bug into `SstFileWriter::Put`: + +``` +diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc +index ab1ee7c4e..c7da9ffa0 100644 +--- a/table/sst_file_writer.cc ++++ b/table/sst_file_writer.cc +@@ -277,6 +277,11 @@ Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { + } + + Status SstFileWriter::Put(const Slice& user_key, const Slice& value) { ++ if (user_key.starts_with("!")) { ++ if (value.ends_with("!")) { ++ return Status::Corruption("bomb"); ++ } ++ } + return rep_->Add(user_key, value, ValueType::kTypeValue); + } +``` + +The bug: if `user_key` starts with `!` and `value` ends with `!`, `Put` returns a `Corruption` status. + +### Run fuzz testing to catch the bug + +Run the fuzzer with `time ./sst_file_writer_fuzzer`.
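Because the binary is a regular libFuzzer target, the standard libFuzzer options also work; for example, to keep a corpus across runs and bound the session length (the `corpus` directory name is just a local choice):

```
mkdir -p corpus
time ./sst_file_writer_fuzzer corpus -max_total_time=60
```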
+ +Here is the output on my machine: + +``` +Corruption: bomb +==59680== ERROR: libFuzzer: deadly signal + #0 0x109487315 in __sanitizer_print_stack_trace+0x35 (libclang_rt.asan_osx_dynamic.dylib:x86_64+0x4d315) + #1 0x108d63f18 in fuzzer::PrintStackTrace() FuzzerUtil.cpp:205 + #2 0x108d47613 in fuzzer::Fuzzer::CrashCallback() FuzzerLoop.cpp:232 + #3 0x7fff6af535fc in _sigtramp+0x1c (libsystem_platform.dylib:x86_64+0x35fc) + #4 0x7ffee720f3ef () + #5 0x7fff6ae29807 in abort+0x77 (libsystem_c.dylib:x86_64+0x7f807) + #6 0x108cf1c4c in TestOneProtoInput(DBOperations&)+0x113c (sst_file_writer_fuzzer:x86_64+0x100302c4c) + #7 0x108cf09be in LLVMFuzzerTestOneInput+0x16e (sst_file_writer_fuzzer:x86_64+0x1003019be) + #8 0x108d48ce0 in fuzzer::Fuzzer::ExecuteCallback(unsigned char const*, unsigned long) FuzzerLoop.cpp:556 + #9 0x108d48425 in fuzzer::Fuzzer::RunOne(unsigned char const*, unsigned long, bool, fuzzer::InputInfo*, bool*) FuzzerLoop.cpp:470 + #10 0x108d4a626 in fuzzer::Fuzzer::MutateAndTestOne() FuzzerLoop.cpp:698 + #11 0x108d4b325 in fuzzer::Fuzzer::Loop(std::__1::vector >&) FuzzerLoop.cpp:830 + #12 0x108d37fcd in fuzzer::FuzzerDriver(int*, char***, int (*)(unsigned char const*, unsigned long)) FuzzerDriver.cpp:829 + #13 0x108d652b2 in main FuzzerMain.cpp:19 + #14 0x7fff6ad5acc8 in start+0x0 (libdyld.dylib:x86_64+0x1acc8) + +NOTE: libFuzzer has rudimentary signal handlers. + Combine libFuzzer with AddressSanitizer or similar for better crash reports. +SUMMARY: libFuzzer: deadly signal +MS: 7 Custom-CustomCrossOver-InsertByte-Custom-ChangeBit-Custom-CustomCrossOver-; base unit: 90863b4d83c3f994bba0a417d0c2ee3b68f9e795 +0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x76,0x61,0x6c,0x75,0x65,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2b,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2e,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x5c,0x32,0x35,0x33,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa, +operations {\x0a key: \"!\"\x0a value: \"!\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \"+\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \".\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \"\\253\"\x0a type: PUT\x0a}\x0a +artifact_prefix='./'; Test unit written to ./crash-a1460be302d09b548e61787178d9edaa40aea467 +Base64: b3BlcmF0aW9ucyB7CiAga2V5OiAiISIKICB2YWx1ZTogIiEiCiAgdHlwZTogUFVUCn0Kb3BlcmF0aW9ucyB7CiAga2V5OiAiKyIKICB0eXBlOiBQVVQKfQpvcGVyYXRpb25zIHsKICBrZXk6ICIuIgogIHR5cGU6IFBVVAp9Cm9wZXJhdGlvbnMgewogIGtleTogIlwyNTMiCiAgdHlwZTogUFVUCn0K +./sst_file_writer_fuzzer 5.97s user 4.40s system 64% cpu 16.195 total +``` + +Within 6 seconds, it catches the bug. + +The input that triggers the bug is persisted in `./crash-a1460be302d09b548e61787178d9edaa40aea467`: + +``` +$ cat ./crash-a1460be302d09b548e61787178d9edaa40aea467 +operations { + key: "!" + value: "!" + type: PUT +} +operations { + key: "+" + type: PUT +} +operations { + key: "." 
+ type: PUT +} +operations { + key: "\253" + type: PUT +} +``` + +### Reproduce the crash to debug + +The above crash can be reproduced by `./sst_file_writer_fuzzer ./crash-a1460be302d09b548e61787178d9edaa40aea467`, +so you can debug the crash. + +## Future Work + +According to [OSS-Fuzz](https://github.com/google/oss-fuzz), +`as of June 2020, OSS-Fuzz has found over 20,000 bugs in 300 open source projects.` + +RocksDB can join OSS-Fuzz together with other open source projects such as sqlite. diff --git a/fuzz/db_fuzzer.cc b/fuzz/db_fuzzer.cc new file mode 100644 index 00000000000..10b4fb8df06 --- /dev/null +++ b/fuzz/db_fuzzer.cc @@ -0,0 +1,159 @@ +#include + +#include "rocksdb/db.h" + +enum OperationType { + kPut, + kGet, + kDelete, + kGetProperty, + kIterator, + kSnapshot, + kOpenClose, + kColumn, + kCompactRange, + kSeekForPrev, + OP_COUNT +}; + +constexpr char db_path[] = "/tmp/testdb"; + +// Fuzzes DB operations by doing interpretations on the data. Both the +// sequence of API calls to be called on the DB as well as the arguments +// to each of these APIs are interpreted by way of the data buffer. +// The operations that the fuzzer supports are given by the OperationType +// enum. The goal is to capture sanitizer bugs, so the code should be +// compiled with a given sanitizer (ASan, UBSan, MSan). +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + rocksdb::DB* db; + rocksdb::Options options; + options.create_if_missing = true; + rocksdb::Status status = rocksdb::DB::Open(options, db_path, &db); + if (!status.ok()) { + return 0; + } + FuzzedDataProvider fuzzed_data(data, size); + + // perform a sequence of calls on our db instance + int max_iter = static_cast(data[0]); + for (int i = 0; i < max_iter && i < size; i++) { + OperationType op = static_cast(data[i] % OP_COUNT); + + switch (op) { + case kPut: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string val = fuzzed_data.ConsumeRandomLengthString(); + db->Put(rocksdb::WriteOptions(), key, val); + break; + } + case kGet: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string value; + db->Get(rocksdb::ReadOptions(), key, &value); + break; + } + case kDelete: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + db->Delete(rocksdb::WriteOptions(), key); + break; + } + case kGetProperty: { + std::string prop; + std::string property_name = fuzzed_data.ConsumeRandomLengthString(); + db->GetProperty(property_name, &prop); + break; + } + case kIterator: { + rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + } + delete it; + break; + } + case kSnapshot: { + rocksdb::ReadOptions snapshot_options; + snapshot_options.snapshot = db->GetSnapshot(); + rocksdb::Iterator* it = db->NewIterator(snapshot_options); + db->ReleaseSnapshot(snapshot_options.snapshot); + delete it; + break; + } + case kOpenClose: { + db->Close(); + delete db; + status = rocksdb::DB::Open(options, db_path, &db); + if (!status.ok()) { + rocksdb::DestroyDB(db_path, options); + return 0; + } + + break; + } + case kColumn: { + rocksdb::ColumnFamilyHandle* cf; + rocksdb::Status s; + s = db->CreateColumnFamily(rocksdb::ColumnFamilyOptions(), "new_cf", + &cf); + s = db->DestroyColumnFamilyHandle(cf); + db->Close(); + delete db; + + // open DB with two column families + std::vector column_families; + // have to open default column family + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + rocksdb::kDefaultColumnFamilyName, 
rocksdb::ColumnFamilyOptions())); + // open the new one, too + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + "new_cf", rocksdb::ColumnFamilyOptions())); + std::vector handles; + s = rocksdb::DB::Open(rocksdb::DBOptions(), db_path, column_families, + &handles, &db); + + if (s.ok()) { + std::string key1 = fuzzed_data.ConsumeRandomLengthString(); + std::string val1 = fuzzed_data.ConsumeRandomLengthString(); + std::string key2 = fuzzed_data.ConsumeRandomLengthString(); + s = db->Put(rocksdb::WriteOptions(), handles[1], key1, val1); + std::string value; + s = db->Get(rocksdb::ReadOptions(), handles[1], key2, &value); + s = db->DropColumnFamily(handles[1]); + for (auto handle : handles) { + s = db->DestroyColumnFamilyHandle(handle); + } + } else { + status = rocksdb::DB::Open(options, db_path, &db); + if (!status.ok()) { + // At this point there is no saving to do. So we exit + rocksdb::DestroyDB(db_path, rocksdb::Options()); + return 0; + } + } + break; + } + case kCompactRange: { + std::string slice_start = fuzzed_data.ConsumeRandomLengthString(); + std::string slice_end = fuzzed_data.ConsumeRandomLengthString(); + + rocksdb::Slice begin(slice_start); + rocksdb::Slice end(slice_end); + rocksdb::CompactRangeOptions options; + rocksdb::Status s = db->CompactRange(options, &begin, &end); + break; + } + case kSeekForPrev: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + auto iter = db->NewIterator(rocksdb::ReadOptions()); + iter->SeekForPrev(key); + delete iter; + break; + } + } + } + + // Cleanup DB + db->Close(); + delete db; + rocksdb::DestroyDB(db_path, options); + return 0; +} diff --git a/fuzz/db_map_fuzzer.cc b/fuzz/db_map_fuzzer.cc new file mode 100644 index 00000000000..4d9637ad906 --- /dev/null +++ b/fuzz/db_map_fuzzer.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "proto/gen/db_operation.pb.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "src/libfuzzer/libfuzzer_macro.h" +#include "util.h" + +protobuf_mutator::libfuzzer::PostProcessorRegistration reg = { + [](DBOperations* input, unsigned int /* seed */) { + const rocksdb::Comparator* comparator = rocksdb::BytewiseComparator(); + auto ops = input->mutable_operations(); + // Make sure begin <= end for DELETE_RANGE. + for (DBOperation& op : *ops) { + if (op.type() == OpType::DELETE_RANGE) { + auto begin = op.key(); + auto end = op.value(); + if (comparator->Compare(begin, end) > 0) { + std::swap(begin, end); + op.set_key(begin); + op.set_value(end); + } + } + } + }}; + +// Execute randomly generated operations on both a DB and a std::map, +// then reopen the DB and make sure that iterating the DB produces the +// same key-value pairs as iterating through the std::map. 
+DEFINE_PROTO_FUZZER(DBOperations& input) { + if (input.operations().empty()) { + return; + } + + const std::string kDbPath = "/tmp/db_map_fuzzer_test"; + auto fs = rocksdb::FileSystem::Default(); + if (fs->FileExists(kDbPath, rocksdb::IOOptions(), /*dbg=*/nullptr).ok()) { + std::cerr << "db path " << kDbPath << " already exists" << std::endl; + abort(); + } + + std::map kv; + rocksdb::DB* db = nullptr; + rocksdb::Options options; + options.create_if_missing = true; + CHECK_OK(rocksdb::DB::Open(options, kDbPath, &db)); + + for (const DBOperation& op : input.operations()) { + switch (op.type()) { + case OpType::PUT: { + CHECK_OK(db->Put(rocksdb::WriteOptions(), op.key(), op.value())); + kv[op.key()] = op.value(); + break; + } + case OpType::MERGE: { + break; + } + case OpType::DELETE: { + CHECK_OK(db->Delete(rocksdb::WriteOptions(), op.key())); + kv.erase(op.key()); + break; + } + case OpType::DELETE_RANGE: { + // [op.key(), op.value()) corresponds to [begin, end). + CHECK_OK(db->DeleteRange(rocksdb::WriteOptions(), + db->DefaultColumnFamily(), op.key(), + op.value())); + kv.erase(kv.lower_bound(op.key()), kv.lower_bound(op.value())); + break; + } + default: { + std::cerr << "Unsupported operation" << static_cast(op.type()); + return; + } + } + } + CHECK_OK(db->Close()); + delete db; + db = nullptr; + + CHECK_OK(rocksdb::DB::Open(options, kDbPath, &db)); + auto kv_it = kv.begin(); + rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); + for (it->SeekToFirst(); it->Valid(); it->Next(), kv_it++) { + CHECK_TRUE(kv_it != kv.end()); + CHECK_EQ(it->key().ToString(), kv_it->first); + CHECK_EQ(it->value().ToString(), kv_it->second); + } + CHECK_TRUE(kv_it == kv.end()); + delete it; + + CHECK_OK(db->Close()); + delete db; + CHECK_OK(rocksdb::DestroyDB(kDbPath, options)); +} diff --git a/fuzz/proto/db_operation.proto b/fuzz/proto/db_operation.proto new file mode 100644 index 00000000000..20a55eaa565 --- /dev/null +++ b/fuzz/proto/db_operation.proto @@ -0,0 +1,28 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// Defines database operations. +// Each operation is a key-value pair and an operation type. + +syntax = "proto2"; + +enum OpType { + PUT = 0; + MERGE = 1; + DELETE = 2; + DELETE_RANGE = 3; +} + +message DBOperation { + required string key = 1; + // value is ignored for DELETE. + // [key, value] is the range for DELETE_RANGE. + optional string value = 2; + required OpType type = 3; +} + +message DBOperations { + repeated DBOperation operations = 1; +} diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc new file mode 100644 index 00000000000..a21544943ec --- /dev/null +++ b/fuzz/sst_file_writer_fuzzer.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "proto/gen/db_operation.pb.h" +#include "rocksdb/file_system.h" +#include "rocksdb/sst_file_writer.h" +#include "src/libfuzzer/libfuzzer_macro.h" +#include "table/table_reader.h" +#include "util.h" + +using namespace ROCKSDB_NAMESPACE; + +// Keys in SST file writer operations must be unique and in ascending order. 
+// For each DBOperation generated by the fuzzer, this function is called on +// it to deduplicate and sort the keys in the DBOperations. +protobuf_mutator::libfuzzer::PostProcessorRegistration reg = { + [](DBOperations* input, unsigned int /* seed */) { + const Comparator* comparator = BytewiseComparator(); + auto ops = input->mutable_operations(); + + // Make sure begin <= end for DELETE_RANGE. + for (DBOperation& op : *ops) { + if (op.type() == OpType::DELETE_RANGE) { + auto begin = op.key(); + auto end = op.value(); + if (comparator->Compare(begin, end) > 0) { + std::swap(begin, end); + op.set_key(begin); + op.set_value(end); + } + } + } + + std::sort(ops->begin(), ops->end(), + [&comparator](const DBOperation& a, const DBOperation& b) { + return comparator->Compare(a.key(), b.key()) < 0; + }); + + auto last = std::unique( + ops->begin(), ops->end(), + [&comparator](const DBOperation& a, const DBOperation& b) { + return comparator->Compare(a.key(), b.key()) == 0; + }); + ops->erase(last, ops->end()); + }}; + +TableReader* NewTableReader(const std::string& sst_file_path, + const Options& options, + const EnvOptions& env_options, + const ImmutableCFOptions& cf_ioptions) { + // This code block is similar to SstFileReader::Open. + + uint64_t file_size = 0; + std::unique_ptr file_reader; + std::unique_ptr table_reader; + const auto& fs = options.env->GetFileSystem(); + FileOptions fopts(env_options); + Status s = options.env->GetFileSize(sst_file_path, fopts.io_options, + &file_size, nullptr); + if (s.ok()) { + s = RandomAccessFileReader::Create(fs, sst_file_path, fopts, &file_reader, + nullptr); + } + if (s.ok()) { + TableReaderOptions t_opt(cf_ioptions, /*prefix_extractor=*/nullptr, + env_options, cf_ioptions.internal_comparator); + t_opt.largest_seqno = kMaxSequenceNumber; + s = options.table_factory->NewTableReader(t_opt, std::move(file_reader), + file_size, &table_reader, + /*prefetch=*/false); + } + if (!s.ok()) { + std::cerr << "Failed to create TableReader for " << sst_file_path << ": " + << s.ToString() << std::endl; + abort(); + } + return table_reader.release(); +} + +ValueType ToValueType(OpType op_type) { + switch (op_type) { + case OpType::PUT: + return ValueType::kTypeValue; + case OpType::MERGE: + return ValueType::kTypeMerge; + case OpType::DELETE: + return ValueType::kTypeDeletion; + case OpType::DELETE_RANGE: + return ValueType::kTypeRangeDeletion; + default: + std::cerr << "Unknown operation type " << static_cast(op_type) + << std::endl; + abort(); + } +} + +// Fuzzes DB operations as input, let SstFileWriter generate a SST file +// according to the operations, then let TableReader read and check all the +// key-value pairs from the generated SST file. +DEFINE_PROTO_FUZZER(DBOperations& input) { + if (input.operations().empty()) { + return; + } + + std::string sstfile; + { + auto fs = FileSystem::Default(); + std::string dir; + IOOptions opt; + CHECK_OK(fs->GetTestDirectory(opt, &dir, nullptr)); + sstfile = dir + "/SstFileWriterFuzzer.sst"; + } + + Options options; + EnvOptions env_options(options); + ImmutableCFOptions cf_ioptions(options); + + // Generate sst file. 
+ SstFileWriter writer(env_options, options); + CHECK_OK(writer.Open(sstfile)); + for (const DBOperation& op : input.operations()) { + switch (op.type()) { + case OpType::PUT: { + CHECK_OK(writer.Put(op.key(), op.value())); + break; + } + case OpType::MERGE: { + CHECK_OK(writer.Merge(op.key(), op.value())); + break; + } + case OpType::DELETE: { + CHECK_OK(writer.Delete(op.key())); + break; + } + case OpType::DELETE_RANGE: { + CHECK_OK(writer.DeleteRange(op.key(), op.value())); + break; + } + default: { + std::cerr << "Unsupported operation" << static_cast(op.type()) + << std::endl; + abort(); + } + } + } + ExternalSstFileInfo info; + CHECK_OK(writer.Finish(&info)); + + // Iterate and verify key-value pairs. + std::unique_ptr table_reader( + NewTableReader(sstfile, options, env_options, cf_ioptions)); + ReadOptions roptions; + CHECK_OK(table_reader->VerifyChecksum(roptions, + TableReaderCaller::kUncategorized)); + std::unique_ptr it( + table_reader->NewIterator(roptions, /*prefix_extractor=*/nullptr, + /*arena=*/nullptr, /*skip_filters=*/true, + TableReaderCaller::kUncategorized)); + it->SeekToFirst(); + for (const DBOperation& op : input.operations()) { + if (op.type() == OpType::DELETE_RANGE) { + // InternalIterator cannot iterate over DELETE_RANGE entries. + continue; + } + CHECK_TRUE(it->Valid()); + ParsedInternalKey ikey; + CHECK_OK(ParseInternalKey(it->key(), &ikey, /*log_err_key=*/true)); + CHECK_EQ(ikey.user_key.ToString(), op.key()); + CHECK_EQ(ikey.sequence, 0); + CHECK_EQ(ikey.type, ToValueType(op.type())); + if (op.type() != OpType::DELETE) { + CHECK_EQ(op.value(), it->value().ToString()); + } + it->Next(); + } + CHECK_TRUE(!it->Valid()); + + // Delete sst file. + remove(sstfile.c_str()); +} diff --git a/fuzz/util.h b/fuzz/util.h new file mode 100644 index 00000000000..44ffaf536e5 --- /dev/null +++ b/fuzz/util.h @@ -0,0 +1,23 @@ +#pragma once + +#define CHECK_OK(expression) \ + do { \ + auto status = (expression); \ + if (!status.ok()) { \ + std::cerr << status.ToString() << std::endl; \ + abort(); \ + } \ + } while (0) + +#define CHECK_EQ(a, b) \ + if (a != b) { \ + std::cerr << "(" << #a << "=" << a << ") != (" << #b << "=" << b << ")" \ + << std::endl; \ + abort(); \ + } + +#define CHECK_TRUE(cond) \ + if (!(cond)) { \ + std::cerr << "\"" << #cond << "\" is false" << std::endl; \ + abort(); \ + } diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index c76c60416db..7804ec46b6c 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -143,6 +143,27 @@ struct CompressionOptions { // Default: false. bool enabled; + // Limit on data buffering when gathering samples to build a dictionary. Zero + // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), + // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect. + // + // In compaction, the buffering is limited to the target file size (see + // `target_file_size_base` and `target_file_size_multiplier`) even if this + // setting permits more buffering. Since we cannot determine where the file + // should be cut until data blocks are compressed with dictionary, buffering + // more than the target file size could lead to selecting samples that belong + // to a later output SST. + // + // Limiting too strictly may harm dictionary effectiveness since it forces + // RocksDB to pick samples from the initial portion of the output SST, which + // may not be representative of the whole file. 
Configuring this limit below + // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can + // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can + // restrict the size of the final dictionary. + // + // Default: 0 (unlimited) + uint64_t max_dict_buffer_bytes; + CompressionOptions() : window_bits(-14), level(kDefaultCompressionLevel), @@ -150,17 +171,30 @@ struct CompressionOptions { max_dict_bytes(0), zstd_max_train_bytes(0), parallel_threads(1), - enabled(false) {} + enabled(false), + max_dict_buffer_bytes(0) {} CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes, int _zstd_max_train_bytes, int _parallel_threads, - bool _enabled) + bool _enabled, uint64_t _max_dict_buffer_bytes) : window_bits(wbits), level(_lev), strategy(_strategy), max_dict_bytes(_max_dict_bytes), zstd_max_train_bytes(_zstd_max_train_bytes), parallel_threads(_parallel_threads), - enabled(_enabled) {} + enabled(_enabled), + max_dict_buffer_bytes(_max_dict_buffer_bytes) {} +}; + +// Temperature of a file. Used to pass to FileSystem for a different +// placement and/or coding. +// Reserve some numbers in the middle, in case we need to insert new tier +// there. +enum class Temperature : uint8_t { + kUnknown = 0, + kHot = 0x04, + kWarm = 0x08, + kCold = 0x0C, }; enum UpdateStatus { // Return status For inplace update callback @@ -237,6 +271,7 @@ struct AdvancedColumnFamilyOptions { // achieve point-in-time consistency using snapshot or iterator (assuming // concurrent updates). Hence iterator and multi-get will return results // which are not consistent as of any point-in-time. + // Backward iteration on memtables will not work either. // If inplace_callback function is not set, // Put(key, new_value) will update inplace the existing_value iff // * key exists in current memtable @@ -356,7 +391,8 @@ struct AdvancedColumnFamilyOptions { // size of one block in arena memory allocation. // If <= 0, a proper value is automatically calculated (usually 1/8 of - // writer_buffer_size, rounded up to a multiple of 4KB). + // writer_buffer_size, rounded up to a multiple of 4KB, or 1MB which ever is + // smaller). // // There are two additional restriction of the specified size: // (1) size should be in the range of [4096, 2 << 30] and @@ -674,10 +710,14 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API bool report_bg_io_stats = false; - // Files older than TTL will go through the compaction process. + // Files containing updates older than TTL will go through the compaction + // process. This usually happens in a cascading way so that those entries + // will be compacted to bottommost level/file. + // The feature is used to remove stale entries that have been deleted or + // updated from the file system. // Pre-req: This needs max_open_files to be set to -1. // In Level: Non-bottom-level files older than TTL will go through the - // compation process. + // compaction process. // In FIFO: Files older than TTL will be deleted. // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 // In FIFO, this option will have the same meaning as @@ -694,6 +734,9 @@ struct AdvancedColumnFamilyOptions { // Files older than this value will be picked up for compaction, and // re-written to the same level as they were before. + // One main use of the feature is to make sure a file goes through compaction + // filters periodically. Users can also use the feature to clear up SST + // files using old format. 
// // A file's age is computed by looking at file_creation_time or creation_time // table properties in order, if they have valid non-zero values; if not, the @@ -727,19 +770,25 @@ struct AdvancedColumnFamilyOptions { // data is left uncompressed (unless compression is also requested). uint64_t sample_for_compression = 0; - // UNDER CONSTRUCTION -- DO NOT USE + // EXPERIMENTAL + // The feature is still in development and is incomplete. + // If this option is set, when creating bottommost files, pass this + // temperature to FileSystem used. Should be no-op for default FileSystem + // and users need to plug in their own FileSystem to take advantage of it. + Temperature bottommost_temperature = Temperature::kUnknown; + // When set, large values (blobs) are written to separate blob files, and // only pointers to them are stored in SST files. This can reduce write // amplification for large-value use cases at the cost of introducing a level // of indirection for reads. See also the options min_blob_size, - // blob_file_size, and blob_compression_type below. + // blob_file_size, blob_compression_type, enable_blob_garbage_collection, + // and blob_garbage_collection_age_cutoff below. // // Default: false // // Dynamically changeable through the SetOptions() API bool enable_blob_files = false; - // UNDER CONSTRUCTION -- DO NOT USE // The size of the smallest value to be stored separately in a blob file. // Values which have an uncompressed size smaller than this threshold are // stored alongside the keys in SST files in the usual fashion. A value of @@ -752,7 +801,6 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API uint64_t min_blob_size = 0; - // UNDER CONSTRUCTION -- DO NOT USE // The size limit for blob files. When writing blob files, a new file is // opened once this limit is reached. Note that enable_blob_files has to be // set in order for this option to have any effect. @@ -762,7 +810,6 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API uint64_t blob_file_size = 1ULL << 28; - // UNDER CONSTRUCTION -- DO NOT USE // The compression algorithm to use for large values stored in blob files. // Note that enable_blob_files has to be set in order for this option to have // any effect. @@ -772,6 +819,28 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API CompressionType blob_compression_type = kNoCompression; + // Enables garbage collection of blobs. Blob GC is performed as part of + // compaction. Valid blobs residing in blob files older than a cutoff get + // relocated to new files as they are encountered during compaction, which + // makes it possible to clean up blob files once they contain nothing but + // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_garbage_collection = false; + + // The cutoff in terms of blob file age for garbage collection. Blobs in + // the oldest N blob files will be relocated when encountered during + // compaction, where N = garbage_collection_cutoff * number_of_blob_files. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. 
+ // + // Default: 0.25 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_age_cutoff = 0.25; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index b8c72ae75d4..8280b25a20e 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -71,8 +71,11 @@ extern "C" { typedef struct rocksdb_t rocksdb_t; typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; +typedef struct rocksdb_backupable_db_options_t rocksdb_backupable_db_options_t; typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; -typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t; +typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t rocksdb_compactionfiltercontext_t; @@ -145,6 +148,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* +rocksdb_backup_engine_open_opts(const rocksdb_backupable_db_options_t* options, + rocksdb_env_t* env, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); @@ -156,7 +163,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups( rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* -rocksdb_restore_options_create(); +rocksdb_restore_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( rocksdb_restore_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( @@ -203,6 +210,100 @@ extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy( extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close( rocksdb_backup_engine_t* be); +/* BackupableDBOptions */ + +extern ROCKSDB_LIBRARY_API rocksdb_backupable_db_options_t* +rocksdb_backupable_db_options_create(const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_backup_dir( + rocksdb_backupable_db_options_t* options, const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_env( + rocksdb_backupable_db_options_t* options, rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_share_table_files( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_share_table_files( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_sync( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backupable_db_options_get_sync( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_destroy_old_data( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API 
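The new column family options documented in advanced_options.h above can be combined roughly as follows. This is a minimal, hypothetical sketch: `SetupOptions` is an invented helper, the values are illustrative, and it assumes LZ4/ZSTD support and blob files are available in the build.

```
#include "rocksdb/options.h"

void SetupOptions(ROCKSDB_NAMESPACE::Options& options) {
  // Cap the data buffered while sampling for dictionary compression
  // (0 would mean unlimited).
  options.compression_opts.max_dict_bytes = 16 * 1024;
  options.compression_opts.zstd_max_train_bytes = 64 * 1024;
  options.compression_opts.max_dict_buffer_bytes = 64 * 1024 * 1024;

  // Ask the FileSystem to place bottommost files on a colder tier; this is
  // a no-op for the default FileSystem.
  options.bottommost_temperature = ROCKSDB_NAMESPACE::Temperature::kWarm;

  // Store large values in blob files and garbage-collect the oldest 25%
  // of blob files as compactions encounter them.
  options.enable_blob_files = true;
  options.min_blob_size = 4096;
  options.blob_file_size = 1ULL << 28;
  options.blob_compression_type = ROCKSDB_NAMESPACE::kLZ4Compression;
  options.enable_blob_garbage_collection = true;
  options.blob_garbage_collection_age_cutoff = 0.25;
}
```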
unsigned char +rocksdb_backupable_db_options_get_destroy_old_data( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_backup_log_files( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_backup_log_files( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_backup_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_backup_rate_limit( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_restore_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_restore_rate_limit( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_max_background_operations( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_max_background_operations( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options, uint64_t size); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_destroy( + rocksdb_backupable_db_options_t*); + +/* Checkpoint */ + extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr); @@ -397,13 +498,13 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf( extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes( rocksdb_t* db, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, - const size_t* range_limit_key_len, uint64_t* sizes); + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf( rocksdb_t* db, rocksdb_column_family_handle_t* column_family, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, - const size_t* range_limit_key_len, uint64_t* sizes); + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db, const char* start_key, @@ -438,6 +539,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf( rocksdb_t* db, const rocksdb_flushoptions_t* options, rocksdb_column_family_handle_t* column_family, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, 
+ unsigned char sync, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr); @@ -483,7 +588,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy (const rocksdb_wal_iter /* Write batch */ -extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create( + void); extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from( const char* rep, size_t size); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy( @@ -712,7 +818,7 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iter /* Block based table options */ extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* -rocksdb_block_based_options_create(); +rocksdb_block_based_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy( rocksdb_block_based_table_options_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size( @@ -787,7 +893,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( /* Cuckoo table options */ extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t* -rocksdb_cuckoo_options_create(); +rocksdb_cuckoo_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy( rocksdb_cuckoo_table_options_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio( @@ -811,7 +917,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_set_options( extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr); -extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy( rocksdb_options_t*); @@ -898,12 +1004,30 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int, int, int, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( rocksdb_options_t*, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( rocksdb_options_t*, rocksdb_slicetransform_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( @@ -962,6 
+1086,37 @@ extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( rocksdb_options_t* opt); +/* Blob Options Settings */ +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_min_blob_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_file_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( + rocksdb_options_t* opt); + /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( rocksdb_options_t* opt); @@ -1275,6 +1430,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( rocksdb_options_t* opt, rocksdb_cache_t* cache ); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( + rocksdb_options_t* opt); + /* RateLimiter */ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); @@ -1363,7 +1526,8 @@ enum { }; extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); -extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( rocksdb_perfcontext_t* context); extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( @@ -1461,7 +1625,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy( /* Read options */ -extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy( rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums( @@ -1518,11 +1683,19 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline( + rocksdb_readoptions_t*, uint64_t microseconds); +extern 
ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*); /* Write options */ -extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* -rocksdb_writeoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy( rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( @@ -1556,7 +1729,7 @@ rocksdb_writeoptions_get_memtable_insert_hint_per_batch( /* Compact range options */ extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t* -rocksdb_compactoptions_create(); +rocksdb_compactoptions_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy( rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void @@ -1582,8 +1755,8 @@ extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level( /* Flush options */ -extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* -rocksdb_flushoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy( rocksdb_flushoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( @@ -1591,11 +1764,31 @@ extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait( rocksdb_flushoptions_t*); +/* Memory allocator */ + +extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t* +rocksdb_jemalloc_nodump_allocator_create(char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy( + rocksdb_memory_allocator_t*); + /* Cache */ +extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t* +rocksdb_lru_cache_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_capacity( + rocksdb_lru_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*); + extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data( + rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( rocksdb_cache_t* cache, size_t capacity); extern ROCKSDB_LIBRARY_API size_t @@ -1612,16 +1805,24 @@ extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*); /* Env */ -extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(); -extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads( + rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern 
ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads( + rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads( rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads( + rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int +rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); @@ -1631,7 +1832,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_cpu_ extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); -extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy( rocksdb_envoptions_t* opt); @@ -1666,7 +1868,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy( rocksdb_sstfilewriter_t* writer); extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t* -rocksdb_ingestexternalfileoptions_create(); +rocksdb_ingestexternalfileoptions_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_set_move_files( rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files); @@ -1712,7 +1914,7 @@ rocksdb_slicetransform_create( extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t); extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* -rocksdb_slicetransform_create_noop(); +rocksdb_slicetransform_create_noop(void); extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( rocksdb_slicetransform_t*); @@ -1724,33 +1926,54 @@ enum { }; extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t* -rocksdb_universal_compaction_options_create(); +rocksdb_universal_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_size_ratio( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_min_merge_width( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_max_merge_width( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_max_size_amplification_percent( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_compression_size_percent( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t*); extern 
ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_stop_style( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* -rocksdb_fifo_compaction_options_create(); +rocksdb_fifo_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_set_max_table_files_size( rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts); @@ -1801,7 +2024,7 @@ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open( const rocksdb_transactiondb_options_t* txn_db_options, const char* name, char** errptr); -rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( const rocksdb_options_t* options, const rocksdb_transactiondb_options_t* txn_db_options, const char* name, int num_column_families, const char* const* column_family_names, @@ -1853,7 +2076,7 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update( const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); -char* rocksdb_transaction_get_for_update_cf( +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf( rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); @@ -1977,7 +2200,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( /* Transaction Options */ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* -rocksdb_transactiondb_options_create(); +rocksdb_transactiondb_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy( rocksdb_transactiondb_options_t* opt); @@ -1997,7 +2220,7 @@ rocksdb_transactiondb_options_set_default_lock_timeout( rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout); extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t* -rocksdb_transaction_options_create(); +rocksdb_transaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy( rocksdb_transaction_options_t* opt); @@ -2023,7 +2246,7 @@ rocksdb_transaction_options_set_max_write_batch_size( rocksdb_transaction_options_t* opt, size_t size); extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* -rocksdb_optimistictransaction_options_create(); +rocksdb_optimistictransaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( rocksdb_optimistictransaction_options_t* opt); @@ -2049,7 +2272,7 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( const rocksdb_pinnableslice_t* t, size_t* vlen); extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* - rocksdb_memory_consumers_create(); +rocksdb_memory_consumers_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( rocksdb_memory_consumers_t* consumers, rocksdb_t* db); extern 
ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index e4c404333df..c1ce88dbd82 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -22,9 +22,11 @@ #pragma once -#include +#include +#include #include #include + #include "rocksdb/memory_allocator.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" @@ -34,6 +36,7 @@ namespace ROCKSDB_NAMESPACE { class Cache; struct ConfigOptions; +class SecondaryCache; extern const bool kDefaultToAdaptiveMutex; @@ -59,10 +62,10 @@ struct LRUCacheOptions { // Percentage of cache reserved for high priority entries. // If greater than zero, the LRU list will be split into a high-pri - // list and a low-pri list. High-pri entries will be insert to the + // list and a low-pri list. High-pri entries will be inserted to the // tail of high-pri list, while low-pri entries will be first inserted to - // the low-pri list (the midpoint). This is refered to as - // midpoint insertion strategy to make entries never get hit in cache + // the low-pri list (the midpoint). This is referred to as + // midpoint insertion strategy to make entries that never get hit in cache // age out faster. // // See also @@ -87,6 +90,9 @@ struct LRUCacheOptions { CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy; + // A SecondaryCache instance to use a the non-volatile tier + std::shared_ptr secondary_cache; + LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, double _high_pri_pool_ratio, @@ -126,17 +132,82 @@ extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); // more detail. // // Return nullptr if it is not supported. +// +// BROKEN: ClockCache is known to have bugs that could lead to crash or +// corruption, so should not be used until fixed. Use NewLRUCache instead. extern std::shared_ptr NewClockCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy); + class Cache { public: // Depending on implementation, cache entries with high priority could be less // likely to get evicted than low priority entries. enum class Priority { HIGH, LOW }; + // A set of callbacks to allow objects in the primary block cache to be + // be persisted in a secondary cache. The purpose of the secondary cache + // is to support other ways of caching the object, such as persistent or + // compressed data, that may require the object to be parsed and transformed + // in some way. Since the primary cache holds C++ objects and the secondary + // cache may only hold flat data that doesn't need relocation, these + // callbacks need to be provided by the user of the block + // cache to do the conversion. + // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers + // to callback functions for size, saving and deletion of the + // object. The callbacks are defined in C-style in order to make them + // stateless and not add to the cache metadata size. + // Saving multiple std::function objects will take up 32 bytes per + // function, even if its not bound to an object and does no capture. + // + // All the callbacks are C-style function pointers in order to simplify + // lifecycle management. Objects in the cache can outlive the parent DB, + // so anything required for these operations should be contained in the + // object itself. 
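As a small, hypothetical sketch of how the `secondary_cache` field added to `LRUCacheOptions` above might be wired up (the `SecondaryCache` implementation itself is assumed to be user-supplied; `MakeCache` and the capacity value are invented for the example):

```
#include <memory>

#include "rocksdb/cache.h"

std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeCache(
    std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary) {
  ROCKSDB_NAMESPACE::LRUCacheOptions opts;
  opts.capacity = 64 << 20;  // 64 MiB volatile (primary) tier
  // Entries evicted from the LRU tier may be spilled to this tier, provided
  // they were inserted with a CacheItemHelper (see below).
  opts.secondary_cache = secondary;
  return ROCKSDB_NAMESPACE::NewLRUCache(opts);
}
```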
+ // + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. + struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + // typedef std::function + // CreateCallback; + using CreateCallback = std::function; + Cache(std::shared_ptr allocator = nullptr) : memory_allocator_(std::move(allocator)) {} // No copying allowed @@ -151,7 +222,7 @@ class Cache { // - Name-value option pairs -- "capacity=1M; num_shard_bits=4; // For the LRUCache, the values are defined in LRUCacheOptions. // @param result The new Cache object - // @return OK if the cache was sucessfully created + // @return OK if the cache was successfully created // @return NotFound if an invalid name was specified in the value // @return InvalidArgument if either the options were not valid static Status CreateFromString(const ConfigOptions& config_options, @@ -170,8 +241,8 @@ class Cache { // The type of the Cache virtual const char* Name() const = 0; - // Insert a mapping from key->value into the cache and assign it - // the specified charge against the total cache capacity. + // Insert a mapping from key->value into the volatile cache only + // and assign it // the specified charge against the total cache capacity. // If strict_capacity_limit is true and cache reaches its full capacity, // return Status::Incomplete. // @@ -184,10 +255,11 @@ class Cache { // insert. In case of error value will be cleanup. // // When the inserted entry is no longer needed, the key and - // value will be passed to "deleter". + // value will be passed to "deleter" which must delete the value. + // (The Cache is responsible for copying and reclaiming space for + // the key.) virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, + DeleterFn deleter, Handle** handle = nullptr, Priority priority = Priority::LOW) = 0; // If the cache has no mapping for "key", returns nullptr. 
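The callback plumbing described above is easier to see with a concrete example. The sketch below is purely illustrative: `MyBlock` and the function names are invented, and the object is assumed to be a flat buffer that can be persisted byte-for-byte.

```
#include <cstring>
#include <string>

#include "rocksdb/cache.h"

using ROCKSDB_NAMESPACE::Cache;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::Status;

// Hypothetical cached object: a flat, self-contained buffer.
struct MyBlock {
  std::string data;
};

// SizeCallback: how many bytes the secondary cache needs to allocate.
size_t MyBlockSize(void* obj) {
  return static_cast<MyBlock*>(obj)->data.size();
}

// SaveToCallback: copy a slice of the persistable data into `out`; the
// secondary cache may call this repeatedly with increasing offsets.
Status MyBlockSaveTo(void* from_obj, size_t from_offset, size_t length,
                     void* out) {
  const auto* block = static_cast<MyBlock*>(from_obj);
  std::memcpy(out, block->data.data() + from_offset, length);
  return Status::OK();
}

// DeleterFn: reclaim the value once the entry is finally dropped.
void MyBlockDelete(const Slice& /*key*/, void* value) {
  delete static_cast<MyBlock*>(value);
}

// Stateless helper; it must outlive any cache that sees it.
const Cache::CacheItemHelper kMyBlockHelper(MyBlockSize, MyBlockSaveTo,
                                            MyBlockDelete);
```

An entry would then be inserted with something like `cache->Insert(key, block, &kMyBlockHelper, block->data.size())`, which also makes it eligible for demotion to a configured secondary cache.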
@@ -264,6 +336,12 @@ class Cache { // returns the charge for the specific entry in the cache. virtual size_t GetCharge(Handle* handle) const = 0; + // Returns the deleter for the specified entry. This might seem useless + // as the Cache itself is responsible for calling the deleter, but + // the deleter can essentially verify that a cache entry is of an + // expected type from an expected code source. + virtual DeleterFn GetDeleter(Handle* handle) const = 0; + // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak // memory - call this only if you're shutting down the process. @@ -273,11 +351,33 @@ class Cache { // default implementation is noop } - // Apply callback to all entries in the cache - // If thread_safe is true, it will also lock the accesses. Otherwise, it will - // access the cache without the lock held - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) = 0; + struct ApplyToAllEntriesOptions { + // If the Cache uses locks, setting `average_entries_per_lock` to + // a higher value suggests iterating over more entries each time a lock + // is acquired, likely reducing the time for ApplyToAllEntries but + // increasing latency for concurrent users of the Cache. Setting + // `average_entries_per_lock` to a smaller value could be helpful if + // callback is relatively expensive, such as using large data structures. + size_t average_entries_per_lock = 256; + }; + + // Apply a callback to all entries in the cache. The Cache must ensure + // thread safety but does not guarantee that a consistent snapshot of all + // entries is iterated over if other threads are operating on the Cache + // also. + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) = 0; + + // DEPRECATED version of above. (Default implementation uses above.) + virtual void ApplyToAllCacheEntries(void (*callback)(void* value, + size_t charge), + bool /*thread_safe*/) { + ApplyToAllEntries([callback](const Slice&, void* value, size_t charge, + DeleterFn) { callback(value, charge); }, + {}); + } // Remove all entries. // Prerequisite: no entry is referenced. @@ -287,6 +387,108 @@ class Cache { MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + // EXPERIMENTAL + // The following APIs are experimental and might change in the future. + // The Insert and Lookup APIs below are intended to allow cached objects + // to be demoted/promoted between the primary block cache and a secondary + // cache. The secondary cache could be a non-volatile cache, and will + // likely store the object in a different representation more suitable + // for on disk storage. They rely on a per object CacheItemHelper to do + // the conversions. + // The secondary cache may persist across process and system restarts, + // and may even be moved between hosts. Therefore, the cache key must + // be repeatable across restarts/reboots, and globally unique if + // multiple DBs share the same cache and the set of DBs can change + // over time. + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // If strict_capacity_limit is true and cache reaches its full capacity, + // return Status::Incomplete. + // + // The helper argument is saved by the cache and will be used when the + // inserted object is evicted or promoted to the secondary cache. 
It, + // therefore, must outlive the cache. + // + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. + // + // Regardless of whether the item was inserted into the cache, + // it will attempt to insert it into the secondary cache if one is + // configured, and the helper supports it. + // The cache implementation must support a secondary cache, otherwise + // the item is only inserted into the primary cache. It may + // defer the insertion to the secondary cache as it sees fit. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) { + if (!helper) { + return Status::InvalidArgument(); + } + return Insert(key, value, charge, helper->del_cb, handle, priority); + } + + // Lookup the key in the primary and secondary caches (if one is configured). + // The create_cb callback function object will be used to contruct the + // cached object. + // If none of the caches have the mapping for the key, returns nullptr. + // Else, returns a handle that corresponds to the mapping. + // + // This call may promote the object from the secondary cache (if one is + // configured, and has the given key) to the primary cache. + // + // The helper argument should be provided if the caller wants the lookup + // to include the secondary cache (if one is configured) and the object, + // if it exists, to be promoted to the primary cache. The helper may be + // saved and used later when the object is evicted. Therefore, it must + // outlive the cache. + // + // The handle returned may not be ready. The caller should call IsReady() + // to check if the item value is ready, and call Wait() or WaitAll() if + // its not ready. The caller should then call Value() to check if the + // item was successfully retrieved. If unsuccessful (perhaps due to an + // IO error), Value() will return nullptr. + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/, + const CreateCallback& /*create_cb*/, + Priority /*priority*/, bool /*wait*/, + Statistics* stats = nullptr) { + return Lookup(key, stats); + } + + // Release a mapping returned by a previous Lookup(). The "useful" + // parameter specifies whether the data was actually used or not, + // which may be used by the cache implementation to decide whether + // to consider it as a hit for retention purposes. + virtual bool Release(Handle* handle, bool /*useful*/, bool force_erase) { + return Release(handle, force_erase); + } + + // Determines if the handle returned by Lookup() has a valid value yet. The + // call is not thread safe and should be called only by someone holding a + // reference to the handle. + virtual bool IsReady(Handle* /*handle*/) { return true; } + + // If the handle returned by Lookup() is not ready yet, wait till it + // becomes ready. + // Note: A ready handle doesn't necessarily mean it has a valid value. The + // user should call Value() and check for nullptr. + virtual void Wait(Handle* /*handle*/) {} + + // Wait for a vector of handles to become ready. 
As with Wait(), the user + // should check the Value() of each handle for nullptr. This call is not + // thread safe and should only be called by the caller holding a reference + // to each of the handles. + virtual void WaitAll(std::vector& /*handles*/) {} + private: std::shared_ptr memory_allocator_; }; diff --git a/include/rocksdb/cache_bench_tool.h b/include/rocksdb/cache_bench_tool.h new file mode 100644 index 00000000000..413ce159378 --- /dev/null +++ b/include/rocksdb/cache_bench_tool.h @@ -0,0 +1,14 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +int cache_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/cloud/cloud_log_controller.h b/include/rocksdb/cloud/cloud_log_controller.h index de151e5d4e7..701c89981b8 100644 --- a/include/rocksdb/cloud/cloud_log_controller.h +++ b/include/rocksdb/cloud/cloud_log_controller.h @@ -34,6 +34,7 @@ class CloudLogWritableFile : public WritableFile { virtual Status status() { return status_; } // Appends data to the file. If the file doesn't exist, it'll get created. + using WritableFile::Append; virtual Status Append(const Slice& data) = 0; // Closes a file by writing an EOF marker to the Cloud stream. diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index ed17889318a..14515976ec5 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -14,23 +14,15 @@ #include #include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { class Slice; class SliceTransform; -// Context information of a compaction run -struct CompactionFilterContext { - // Does this compaction run include all data files - bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process - bool is_manual_compaction; -}; - -// CompactionFilter allows an application to modify/delete a key-value at -// the time of compaction. +// CompactionFilter allows an application to modify/delete a key-value during +// table file creation. class CompactionFilter { public: @@ -47,35 +39,38 @@ class CompactionFilter { kRemoveAndSkipUntil, kChangeBlobIndex, // used internally by BlobDB. kIOError, // used internally by BlobDB. + kUndetermined, }; enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; - // Context information of a compaction run + // Context information for a table file creation. struct Context { - // Does this compaction run include all data files + // Whether this table file is created as part of a compaction including all + // table files. bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process + // Whether this table file is created as part of a compaction requested by + // the client. bool is_manual_compaction; - // Which column family this compaction is for. + // The column family that will contain the created table file. uint32_t column_family_id; + // Reason this table file is being created. 
+ TableFileCreationReason reason; }; virtual ~CompactionFilter() {} - // The compaction process invokes this - // method for kv that is being compacted. A return value - // of false indicates that the kv should be preserved in the - // output of this compaction run and a return value of true - // indicates that this key-value should be removed from the - // output of the compaction. The application can inspect - // the existing value of the key and make decision based on it. + // The table file creation process invokes this method before adding a kv to + // the table file. A return value of false indicates that the kv should be + // preserved in the new table file and a return value of true indicates + // that this key-value should be removed from the new table file. The + // application can inspect the existing value of the key and make decision + // based on it. // - // Key-Values that are results of merge operation during compaction are not - // passed into this function. Currently, when you have a mix of Put()s and - // Merge()s on a same key, we only guarantee to process the merge operands - // through the compaction filters. Put()s might be processed, or might not. + // Key-Values that are results of merge operation during table file creation + // are not passed into this function. Currently, when you have a mix of Put()s + // and Merge()s on a same key, we only guarantee to process the merge operands + // through the `CompactionFilter`s. Put()s might be processed, or might not. // // When the value is to be preserved, the application has the option // to modify the existing_value and pass it back through new_value. @@ -83,9 +78,10 @@ class CompactionFilter { // // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a // DB* object) will not guarantee to preserve the state of the DB with - // CompactionFilter. Data seen from a snapshot might disppear after a - // compaction finishes. If you use snapshots, think twice about whether you - // want to use compaction filter and whether you are using it in a safe way. + // CompactionFilter. Data seen from a snapshot might disappear after a + // table file created with a `CompactionFilter` is installed. If you use + // snapshots, think twice about whether you want to use `CompactionFilter` and + // whether you are using it in a safe way. // // If multithreaded compaction is being used *and* a single CompactionFilter // instance was supplied via Options::compaction_filter, this method may be @@ -93,7 +89,7 @@ class CompactionFilter { // that the call is thread-safe. // // If the CompactionFilter was created by a factory, then it will only ever - // be used by a single thread that is doing the compaction run, and this + // be used by a single thread that is doing the table file creation, and this // call does not need to be thread-safe. However, multiple filters may be // in existence and operating concurrently. virtual bool Filter(int /*level*/, const Slice& /*key*/, @@ -103,9 +99,9 @@ class CompactionFilter { return false; } - // The compaction process invokes this method on every merge operand. If this - // method returns true, the merge operand will be ignored and not written out - // in the compaction output + // The table file creation process invokes this method on every merge operand. + // If this method returns true, the merge operand will be ignored and not + // written out in the new table file. // // Note: If you are using a TransactionDB, it is not recommended to implement // FilterMergeOperand(). 
If a Merge operation is filtered out, TransactionDB @@ -142,14 +138,16 @@ class CompactionFilter { // snapshot - beware if you're using TransactionDB or // DB::GetSnapshot(). // - If value for a key was overwritten or merged into (multiple Put()s - // or Merge()s), and compaction filter skips this key with + // or Merge()s), and `CompactionFilter` skips this key with // kRemoveAndSkipUntil, it's possible that it will remove only // the new value, exposing the old value that was supposed to be // overwritten. // - Doesn't work with PlainTableFactory in prefix mode. - // - If you use kRemoveAndSkipUntil, consider also reducing - // compaction_readahead_size option. + // - If you use kRemoveAndSkipUntil for table files created by + // compaction, consider also reducing compaction_readahead_size + // option. // + // Should never return kUndetermined. // Note: If you are using a TransactionDB, it is not recommended to filter // out or modify merge operands (ValueType::kMergeOperand). // If a merge operation is filtered out, TransactionDB may not realize there @@ -187,27 +185,53 @@ class CompactionFilter { } // This function is deprecated. Snapshots will always be ignored for - // compaction filters, because we realized that not ignoring snapshots doesn't - // provide the gurantee we initially thought it would provide. Repeatable - // reads will not be guaranteed anyway. If you override the function and - // returns false, we will fail the compaction. + // `CompactionFilter`s, because we realized that not ignoring snapshots + // doesn't provide the guarantee we initially thought it would provide. + // Repeatable reads will not be guaranteed anyway. If you override the + // function and returns false, we will fail the table file creation. virtual bool IgnoreSnapshots() const { return true; } - // Returns a name that identifies this compaction filter. + // Returns a name that identifies this `CompactionFilter`. // The name will be printed to LOG file on start up for diagnosis. virtual const char* Name() const = 0; + + // Internal (BlobDB) use only. Do not override in application code. + virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; } + + // In the case of BlobDB, it may be possible to reach a decision with only + // the key without reading the actual value. Keys whose value_type is + // kBlobIndex will be checked by this method. + // Returning kUndetermined will cause FilterV2() to be called to make a + // decision as usual. + virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/, + std::string* /*new_value*/, + std::string* /*skip_until*/) const { + return Decision::kUndetermined; + } }; -// Each compaction will create a new CompactionFilter allowing the -// application to know about different compactions +// Each thread of work involving creating table files will create a new +// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This +// allows the application to know about the different ongoing threads of work +// and makes it unnecessary for `CompactionFilter` to provide thread-safety. class CompactionFilterFactory { public: virtual ~CompactionFilterFactory() {} + // Returns whether a thread creating table files for the specified `reason` + // should invoke `CreateCompactionFilter()` and pass KVs through the returned + // filter. 
+ virtual bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const { + // For backward compatibility, default implementation only applies + // `CompactionFilter` to files generated by compaction. + return reason == TableFileCreationReason::kCompaction; + } + virtual std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) = 0; - // Returns a name that identifies this compaction filter factory. + // Returns a name that identifies this `CompactionFilter` factory. virtual const char* Name() const = 0; }; diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index 0f9c8fcbb1f..626f3202ff9 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -25,25 +25,33 @@ struct CompactionJobStats { // the number of compaction input records. uint64_t num_input_records; - // the number of compaction input files. + // the number of blobs read from blob files + uint64_t num_blobs_read; + // the number of compaction input files (table files) size_t num_input_files; - // the number of compaction input files at the output level. + // the number of compaction input files at the output level (table files) size_t num_input_files_at_output_level; // the number of compaction output records. uint64_t num_output_records; - // the number of compaction output files. + // the number of compaction output files (table files) size_t num_output_files; + // the number of compaction output files (blob files) + size_t num_output_files_blob; // true if the compaction is a full compaction (all live SST files input) bool is_full_compaction; // true if the compaction is a manual compaction bool is_manual_compaction; - // the size of the compaction input in bytes. + // the total size of table files in the compaction input uint64_t total_input_bytes; - // the size of the compaction output in bytes. + // the total size of blobs read from blob files + uint64_t total_blob_bytes_read; + // the total size of table files in the compaction output uint64_t total_output_bytes; + // the total size of blob files in the compaction output + uint64_t total_output_bytes_blob; // number of records being replaced by newer record associated with same key. // this could be a new value or a deletion entry for that key so this field diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 53a46ad3359..37c2925bc33 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -110,7 +110,9 @@ class Comparator { // == 0 iff t1 == t2 // > 0 iff t1 > t2 // Note that an all-zero byte array will be the smallest (oldest) timestamp - // of the same length. + // of the same length, and a byte array with all bits 1 will be the largest. + // In the future, we can extend Comparator so that subclasses can specify + // both largest and smallest timestamps. 
virtual int CompareTimestamp(const Slice& /*ts1*/, const Slice& /*ts2*/) const { return 0; @@ -121,6 +123,11 @@ class Comparator { return Compare(a, b); } + virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const { + return 0 == + CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + private: size_t timestamp_size_; }; diff --git a/include/rocksdb/concurrent_task_limiter.h b/include/rocksdb/concurrent_task_limiter.h index 4fc6b794063..f8e7ed8ab44 100644 --- a/include/rocksdb/concurrent_task_limiter.h +++ b/include/rocksdb/concurrent_task_limiter.h @@ -33,7 +33,7 @@ class ConcurrentTaskLimiter { virtual int32_t GetOutstandingTask() const = 0; }; -// Create a ConcurrentTaskLimiter that can be shared with mulitple CFs +// Create a ConcurrentTaskLimiter that can be shared with multiple CFs // across RocksDB instances to control concurrent tasks. // // @param name: Name of the limiter. diff --git a/include/rocksdb/configurable.h b/include/rocksdb/configurable.h index f4bfbf53285..b56072dbeae 100644 --- a/include/rocksdb/configurable.h +++ b/include/rocksdb/configurable.h @@ -28,7 +28,7 @@ struct DBOptions; // standard way of configuring objects. A Configurable object can: // -> Populate itself given: // - One or more "name/value" pair strings -// - A string repesenting the set of name=value properties +// - A string representing the set of name=value properties // - A map of name/value properties. // -> Convert itself into its string representation // -> Dump itself to a Logger @@ -166,7 +166,7 @@ class Configurable { // This is the inverse of ConfigureFromString. // @param config_options Controls how serialization happens. // @param result The string representation of this object. - // @return OK If the options for this object wer successfully serialized. + // @return OK If the options for this object were successfully serialized. // @return InvalidArgument If one or more of the options could not be // serialized. Status GetOptionString(const ConfigOptions& config_options, @@ -240,7 +240,7 @@ class Configurable { // @param config_options Controls how the object is prepared. Also contains // a Logger and Env that can be used to initialize this object. // @return OK If the object was successfully initialized. - // @return InvalidArgument If this object could not be successfull + // @return InvalidArgument If this object could not be successfully // initialized. virtual Status PrepareOptions(const ConfigOptions& config_options); @@ -270,18 +270,13 @@ class Configurable { // True once the object is prepared. Once the object is prepared, only // mutable options can be configured. bool prepared_; - // If this class is a wrapper (has-a), this method should be - // over-written to return the inner configurable (like an EnvWrapper). - // This method should NOT recurse, but should instead return the - // direct Inner object. - virtual Configurable* Inner() const { return nullptr; } // Returns the raw pointer for the associated named option. // The name is typically the name of an option registered via the // Classes may override this method to provide further specialization (such as // returning a sub-option) // - // The default implemntation looks at the registered options. If the + // The default implementation looks at the registered options. If the // input name matches that of a registered option, the pointer registered // with that name is returned. 
// e.g,, RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns @@ -355,6 +350,35 @@ class Configurable { // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) virtual std::string GetOptionName(const std::string& long_name) const; + // Registers the input name with the options and associated map. + // When classes register their options in this manner, most of the + // functionality (excluding unknown options and validate/prepare) is + // implemented by the base class. + // + // This method should be called in the class constructor to register the + // option set for this object. For example, to register the options + // associated with the BlockBasedTableFactory, the constructor calls this + // method passing in: + // - the name of the options ("BlockBasedTableOptions"); + // - the options object (the BlockBasedTableOptions object for this object; + // - the options type map for the BlockBasedTableOptions. + // This registration allows the Configurable class to process the option + // values associated with the BlockBasedTableOptions without further code in + // the derived class. + // + // @param name The name of this set of options (@see GetOptionsPtr) + // @param opt_ptr Pointer to the options to associate with this name + // @param opt_map Options map that controls how this option is configured. + template + void RegisterOptions( + T* opt_ptr, + const std::unordered_map* opt_map) { + RegisterOptions(T::kName(), opt_ptr, opt_map); + } + void RegisterOptions( + const std::string& name, void* opt_ptr, + const std::unordered_map* opt_map); + private: // Contains the collection of options (name, opt_ptr, opt_map) associated with // this object. This collection is typically set in the constructor of the diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index f861b2fcf3d..dfcd7e22929 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -16,6 +16,9 @@ namespace ROCKSDB_NAMESPACE { class Env; +class Logger; +class ObjectRegistry; + struct ColumnFamilyOptions; struct DBOptions; struct Options; @@ -27,6 +30,15 @@ struct Options; // of the serialization (e.g. delimiter), and how to compare // options (sanity_level). struct ConfigOptions { + // Constructs a new ConfigOptions with a new object registry. + // This method should only be used when a DBOptions is not available, + // else registry settings may be lost + ConfigOptions(); + + // Constructs a new ConfigOptions using the settings from + // the input DBOptions. Currently constructs a new object registry. + explicit ConfigOptions(const DBOptions&); + // This enum defines the RocksDB options sanity level. enum SanityLevel : unsigned char { kSanityLevelNone = 0x01, // Performs no sanity check at all. @@ -56,6 +68,13 @@ struct ConfigOptions { // Whether or not to invoke PrepareOptions after configure is called. bool invoke_prepare_options = true; + // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not. + // When "mutable_options_only=false", all options are evaluated. + // When "mutable_options_only="true", any option not marked as Mutable is + // either ignored (in the case of string/equals methods) or results in an + // error (in the case of Configure). 
+ bool mutable_options_only = false; + // The separator between options when converting to a string std::string delimiter = ";"; @@ -71,6 +90,11 @@ struct ConfigOptions { // The environment to use for this option Env* env = Env::Default(); +#ifndef ROCKSDB_LITE + // The object registry to use for this options + std::shared_ptr registry; +#endif + bool IsShallow() const { return depth == Depth::kDepthShallow; } bool IsDetailed() const { return depth == Depth::kDepthDetailed; } @@ -86,7 +110,7 @@ struct ConfigOptions { #ifndef ROCKSDB_LITE // The following set of functions provide a way to construct RocksDB Options -// from a string or a string-to-string map. Here're the general rule of +// from a string or a string-to-string map. Here is the general rule of // setting option values from strings by type. Some RocksDB types are also // supported in these APIs. Please refer to the comment of the function itself // to find more information about how to config those RocksDB types. @@ -142,7 +166,7 @@ struct ConfigOptions { // ColumnFamilyOptions "new_options". // // Below are the instructions of how to config some non-primitive-typed -// options in ColumnFOptions: +// options in ColumnFamilyOptions: // // * table_factory: // table_factory can be configured using our custom nested-option syntax. @@ -184,7 +208,7 @@ struct ConfigOptions { // * {"memtable", "skip_list:5"} is equivalent to setting // memtable to SkipListFactory(5). // - PrefixHash: -// Pass "prfix_hash:" to config memtable +// Pass "prefix_hash:" to config memtable // to use PrefixHash, or simply "prefix_hash" to use the default // PrefixHash. // [Example]: @@ -493,7 +517,6 @@ Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const ReadOptions& read_options, const std::string& file_path); - #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/customizable.h b/include/rocksdb/customizable.h new file mode 100644 index 00000000000..24ddfa56c0c --- /dev/null +++ b/include/rocksdb/customizable.h @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/configurable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +/** + * Customizable a base class used by the rocksdb that describes a + * standard way of configuring and creating objects. Customizable objects + * are configurable objects that can be created from an ObjectRegistry. + * + * Customizable classes are used when there are multiple potential + * implementations of a class for use by RocksDB (e.g. Table, Cache, + * MergeOperator, etc). The abstract base class is expected to define a method + * declaring its type and a factory method for creating one of these, such as: + * static const char *Type() { return "Table"; } + * static Status CreateFromString(const ConfigOptions& options, + * const std::string& id, + * std::shared_ptr* result); + * The "Type" string is expected to be unique (no two base classes are the same + * type). 
This factory is expected, based on the options and id, create and + * return the appropriate derived type of the customizable class (e.g. + * BlockBasedTableFactory, PlainTableFactory, etc). For extension developers, + * helper classes and methods are provided for writing this factory. + * + * Instances of a Customizable class need to define: + * - A "static const char *kClassName()" method. This method defines the name + * of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the + * CheckedCast method. + * - The Name() of the object. This name is used when creating and saving + * instances of this class. Typically this name will be the same as + * kClassName(). + * + * Additionally, Customizable classes should register any options used to + * configure themselves with the Configurable subsystem. + * + * When a Customizable is being created, the "name" property specifies + * the name of the instance being created. + * For custom objects, their configuration and name can be specified by: + * [prop]={name=X;option 1 = value1[; option2=value2...]} + * + * [prop].name=X + * [prop].option1 = value1 + * + * [prop].name=X + * X.option1 =value1 + */ +class Customizable : public Configurable { + public: + virtual ~Customizable() {} + + // Returns the name of this class of Customizable + virtual const char* Name() const = 0; + + // Returns an identifier for this Customizable. + // This could be its name or something more complex (like its URL/pattern). + // Used for pretty printing. + virtual std::string GetId() const { + std::string id = Name(); + return id; + } + + // This is typically determined by if the input name matches the + // name of this object. + // This method is typically used in conjunction with CheckedCast to find the + // derived class instance from its base. For example, if you have an Env + // and want the "Default" env, you would IsInstanceOf("Default") to get + // the default implementation. This method should be used when you need a + // specific derivative or implementation of a class. + // + // Intermediary caches (such as SharedCache) may wish to override this method + // to check for the intermediary name (SharedCache). Classes with multiple + // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override + // this method. + // + // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a". + // Wrapped classes that have an Inner "has-a" should not be returned. + // + // @param name The name of the instance to find. + // Returns true if the class is an instance of the input name. + virtual bool IsInstanceOf(const std::string& name) const { + return name == Name(); + } + + // Returns the named instance of the Customizable as a T*, or nullptr if not + // found. This method uses IsInstanceOf/Inner to find the appropriate class + // instance and then casts it to the expected return type. + template + const T* CheckedCast() const { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + const auto inner = Inner(); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + template + T* CheckedCast() { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + auto inner = const_cast(Inner()); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + // Checks to see if this Customizable is equivalent to other. + // This method assumes that the two objects are of the same class. 
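As a usage sketch of the IsInstanceOf()/CheckedCast() pattern defined above (the class names here are invented for illustration and are not part of this patch):

  class MyPlugin : public Customizable {
   public:
    static const char* kClassName() { return "MyPlugin"; }
    const char* Name() const override { return kClassName(); }
  };

  // Given only a Customizable*, recover the derived type if it matches:
  void MaybeUse(const Customizable* c) {
    // CheckedCast consults IsInstanceOf() and, failing that, Inner().
    const MyPlugin* p = c->CheckedCast<MyPlugin>();
    if (p != nullptr) {
      // p may safely be used as a MyPlugin here.
    }
  }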
+ // @param config_options Controls how the options are compared. + // @param other The other object to compare to. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. + // @see Configurable::AreEquivalent for more details + bool AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const override; +#ifndef ROCKSDB_LITE + // Gets the value of the option associated with the input name + // @see Configurable::GetOption for more details + Status GetOption(const ConfigOptions& config_options, const std::string& name, + std::string* value) const override; +#endif // ROCKSDB_LITE + + // Returns the inner class when a Customizable implements a has-a (wrapped) + // relationship. Derived classes that implement a has-a must override this + // method in order to get CheckedCast to function properly. + virtual const Customizable* Inner() const { return nullptr; } + + protected: + // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) + std::string GetOptionName(const std::string& long_name) const override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& options, + const std::string& prefix) const override; +#endif // ROCKSDB_LITE +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h new file mode 100644 index 00000000000..c9a4ebd829b --- /dev/null +++ b/include/rocksdb/data_structure.h @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This is a data structure specifically designed as a "Set" for a +// pretty small scale of Enum structure. For now, it can support up +// to 64 element, and it is expandable in the future. +template +class SmallEnumSet { + public: + SmallEnumSet() : state_(0) {} + + ~SmallEnumSet() {} + + // Return true if the input enum is included in the "Set" (i.e., changes the + // internal scalar state successfully), otherwise, it will return false. + bool Add(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t old_state = state_; + uint64_t tmp = 1; + state_ |= (tmp << value); + return old_state != state_; + } + + // Return true if the input enum is contained in the "Set". + bool Contains(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t tmp = 1; + return state_ & (tmp << value); + } + + private: + uint64_t state_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index e31042170f8..88bd00004b1 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -115,7 +115,7 @@ struct RangePtr { }; // It is valid that files_checksums and files_checksum_func_names are both -// empty (no checksum informaiton is provided for ingestion). Otherwise, +// empty (no checksum information is provided for ingestion). Otherwise, // their sizes should be the same as external_files. 
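For instance, a hedged sketch of filling in this struct with optional per-file checksum information; the paths and checksum strings below are placeholders and `db` is assumed to be an already-open instance:

  IngestExternalFileArg arg;
  arg.column_family = db->DefaultColumnFamily();
  arg.external_files = {"/tmp/file1.sst", "/tmp/file2.sst"};
  // Either leave both checksum vectors empty, or supply one entry per file,
  // in the same order as external_files:
  arg.files_checksums = {"<checksum-of-file1>", "<checksum-of-file2>"};
  arg.files_checksum_func_names = {"crc32c", "crc32c"};
  Status s = db->IngestExternalFiles({arg});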
The file order should // be the same in three vectors and guaranteed by the caller. struct IngestExternalFileArg { @@ -143,11 +143,15 @@ typedef std::unordered_map> // and a number of wrapper implementations. class DB { public: - // Open the database with the specified "name". + // Open the database with the specified "name" for reads and writes. // Stores a pointer to a heap-allocated database in *dbptr and returns // OK on success. - // Stores nullptr in *dbptr and returns a non-OK status on error. - // Caller should delete *dbptr when it is no longer needed. + // Stores nullptr in *dbptr and returns a non-OK status on error, including + // if the DB is already open (read-write) by another DB object. (This + // guarantee depends on options.env->LockFile(), which might not provide + // this guarantee in a custom Env implementation.) + // + // Caller must delete *dbptr when it is no longer needed. static Status Open(const Options& options, const std::string& name, DB** dbptr); @@ -156,6 +160,12 @@ class DB { // If the db is opened in read only mode, then no compactions // will happen. // + // While a given DB can be simultaneously open via OpenForReadOnly + // by any number of readers, if a DB is simultaneously open by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. static Status OpenForReadOnly(const Options& options, const std::string& name, @@ -168,6 +178,12 @@ class DB { // column family. The default column family name is 'default' and it's stored // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName // + // While a given DB can be simultaneously open via OpenForReadOnly + // by any number of readers, if a DB is simultaneously open by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. static Status OpenForReadOnly( @@ -208,11 +224,11 @@ class DB { // to open the primary instance. // The secondary_path argument points to a directory where the secondary // instance stores its info log. - // The column_families argument specifieds a list of column families to open. + // The column_families argument specifies a list of column families to open. // If any of the column families does not exist, the function returns non-OK // status. // The handles is an out-arg corresponding to the opened database column - // familiy handles. + // family handles. // The dbptr is an out-arg corresponding to the opened secondary instance. // The pointer points to a heap-allocated database, and the caller should // delete it after use. Before deleting the dbptr, the user should also @@ -242,6 +258,16 @@ class DB { const std::vector& column_families, std::vector* handles, DB** dbptr); + // Open DB and run the compaction. + // It's a read-only operation, the result won't be installed to the DB, it + // will be output to the `output_directory`. The API should only be used with + // `options.CompactionService` to run compaction triggered by + // `CompactionService`. 
+ static Status OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* output, + const CompactionServiceOptionsOverride& override_options); + virtual Status Resume() { return Status::NotSupported(); } // Close the DB by releasing resources, closing files etc. This should be @@ -716,7 +742,9 @@ class DB { virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; #ifndef ROCKSDB_LITE - // Contains all valid property arguments for GetProperty(). + // Contains all valid property arguments for GetProperty() or + // GetMapProperty(). Each is a "string" property for retrieval with + // GetProperty() unless noted as a "map" property, for GetMapProperty(). // // NOTE: Property names cannot end in numbers since those are interpreted as // arguments, e.g., see kNumFilesAtLevelPrefix. @@ -741,19 +769,14 @@ class DB { // SST files. static const std::string kSSTables; - // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and - // "rocksdb.cf-file-histogram" together. See below for description - // of the two. + // "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram" + // and "rocksdb.cf-file-histogram" as a "map" property. static const std::string kCFStats; // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with - // general columm family stats per-level over db's lifetime ("L"), + // general column family stats per-level over db's lifetime ("L"), // aggregated over db's lifetime ("Sum"), and aggregated over the // interval since the last retrieval ("Int"). - // It could also be used to return the stats in the format of the map. - // In this case there will a pair of string to array of double for - // each level as well as for "Sum". "Int" stats will not be affected - // when this form of stats are retrieved. static const std::string kCFStatsNoFileHistogram; // "rocksdb.cf-file-histogram" - print out how many file reads to every @@ -769,6 +792,10 @@ class DB { // of files per level and total size of each level (MB). static const std::string kLevelStats; + // "rocksdb.block-cache-entry-stats" - returns a multi-line string or + // map with statistics on block cache usage. + static const std::string kBlockCacheEntryStats; + // "rocksdb.num-immutable-mem-table" - returns number of immutable // memtables that have not yet been flushed. static const std::string kNumImmutableMemTable; @@ -894,8 +921,10 @@ class DB { // based. static const std::string kEstimatePendingCompactionBytes; - // "rocksdb.aggregated-table-properties" - returns a string representation - // of the aggregated table properties of the target column family. + // "rocksdb.aggregated-table-properties" - returns a string or map + // representation of the aggregated table properties of the target + // column family. Only properties that make sense for aggregation + // are included. static const std::string kAggregatedTableProperties; // "rocksdb.aggregated-table-properties-at-level", same as the previous @@ -933,15 +962,19 @@ class DB { }; #endif /* ROCKSDB_LITE */ - // DB implementations can export properties about their state via this method. - // If "property" is a valid property understood by this DB implementation (see - // Properties struct above for valid options), fills "*value" with its current - // value and returns true. Otherwise, returns false. + // DB implementations export properties about their state via this method. 
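A brief sketch of retrieving one "string" property and one "map" property named in the Properties struct above, assuming an open `db` (error handling omitted):

  std::string num_imm;
  if (db->GetProperty(DB::Properties::kNumImmutableMemTable, &num_imm)) {
    // num_imm holds a decimal count, e.g. "0"
  }

  std::map<std::string, std::string> cf_stats;
  if (db->GetMapProperty(db->DefaultColumnFamily(), DB::Properties::kCFStats,
                         &cf_stats)) {
    // per-level column family statistics, keyed by stat name
  }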
+ // If "property" is a valid "string" property understood by this DB + // implementation (see Properties struct above for valid options), fills + // "*value" with its current value and returns true. Otherwise, returns + // false. virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) { return GetProperty(DefaultColumnFamily(), property, value); } + + // Like GetProperty but for valid "map" properties. (Some properties can be + // accessed as either "string" properties or "map" properties.) virtual bool GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property, std::map* value) = 0; @@ -1025,21 +1058,24 @@ class DB { uint64_t* sizes) = 0; // Simpler versions of the GetApproximateSizes() method above. - // The include_flags argumenbt must of type DB::SizeApproximationFlags + // The include_flags argument must of type DB::SizeApproximationFlags // and can not be NONE. - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* ranges, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) { + virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* ranges, int n, + uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { SizeApproximationOptions options; options.include_memtabtles = (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; options.include_files = (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; - GetApproximateSizes(options, column_family, ranges, n, sizes); + return GetApproximateSizes(options, column_family, ranges, n, sizes); } - virtual void GetApproximateSizes(const Range* ranges, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) { - GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, include_flags); + virtual Status GetApproximateSizes(const Range* ranges, int n, + uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { + return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, + include_flags); } // The method is similar to GetApproximateSizes, except it @@ -1079,6 +1115,8 @@ class DB { // and the data is rearranged to reduce the cost of operations // needed to access the data. This operation should typically only // be invoked by users who understand the underlying implementation. + // This call blocks until the operation completes successfully, fails, + // or is aborted (Status::Incomplete). See DisableManualCompaction. // // begin==nullptr is treated as a key before all keys in the database. // end==nullptr is treated as a key after all keys in the database. @@ -1133,9 +1171,9 @@ class DB { const std::unordered_map& new_options) = 0; // CompactFiles() inputs a list of files specified by file numbers and - // compacts them to the specified level. Note that the behavior is different - // from CompactRange() in that CompactFiles() performs the compaction job - // using the CURRENT thread. + // compacts them to the specified level. A small difference compared to + // CompactRange() is that CompactFiles() performs the compaction job + // using the CURRENT thread, so is not considered a "background" job. // // @see GetDataBaseMetaData // @see GetColumnFamilyMetaData @@ -1177,7 +1215,16 @@ class DB { virtual Status EnableAutoCompaction( const std::vector& column_family_handles) = 0; + // After this function call, CompactRange() or CompactFiles() will not + // run compactions and fail. 
Calling this function will tell outstanding + // manual compactions to abort and will wait for them to finish or abort + // before returning. virtual void DisableManualCompaction() = 0; + // Re-enable CompactRange() and ComapctFiles() that are disabled by + // DisableManualCompaction(). This function must be called as many times + // as DisableManualCompaction() has been called in order to re-enable + // manual compactions, and must not be called more times than + // DisableManualCompaction() has been called. virtual void EnableManualCompaction() = 0; // Number of levels used for this DB. @@ -1365,7 +1412,7 @@ class DB { virtual void GetLiveFilesMetaData( std::vector* /*metadata*/) {} - // Return a list of all table file checksum info. + // Return a list of all table and blob files checksum info. // Note: This function might be of limited use because it cannot be // synchronized with GetLiveFiles. virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0; @@ -1446,6 +1493,12 @@ class DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of + // table files are checked. + virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { + return Status::NotSupported("File verification not supported"); + } + // Verify the block checksums of files in db. The block checksums of table // files are checked. virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; @@ -1602,14 +1655,14 @@ class DB { return Status::NotSupported("EndTrace() is not implemented."); } - // StartIOTrace and EndIOTrace are experimental. They are not enabled yet. - virtual Status StartIOTrace(Env* /*env*/, const TraceOptions& /*options*/, + // IO Tracing operations. Use EndIOTrace() to stop tracing. + virtual Status StartIOTrace(const TraceOptions& /*options*/, std::unique_ptr&& /*trace_writer*/) { - return Status::NotSupported("StartTrace() is not implemented."); + return Status::NotSupported("StartIOTrace() is not implemented."); } virtual Status EndIOTrace() { - return Status::NotSupported("StartTrace() is not implemented."); + return Status::NotSupported("EndIOTrace() is not implemented."); } // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. 
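A short sketch of the DisableManualCompaction()/EnableManualCompaction() nesting contract described above, assuming an open `db`; the calls must be balanced:

  db->DisableManualCompaction();  // waits out / aborts in-flight manual compactions
  db->DisableManualCompaction();  // nesting is allowed

  // A CompactRange()/CompactFiles() issued here will not run; it fails
  // (typically reported as an aborted/incomplete status) instead of compacting.

  db->EnableManualCompaction();   // still disabled: one Disable() remains outstanding
  db->EnableManualCompaction();   // balanced again; manual compactions may run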
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 955d591c310..a4463060a90 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -17,12 +17,15 @@ #pragma once #include + #include #include #include #include #include #include + +#include "rocksdb/functor_wrapper.h" #include "rocksdb/status.h" #include "rocksdb/thread_status.h" @@ -30,11 +33,12 @@ // Windows API macro interference #undef DeleteFile #undef GetCurrentTime +#undef LoadLibrary #endif #if defined(__GNUC__) || defined(__clang__) #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ - __attribute__((__format__(__printf__, format_param, dots_param))) + __attribute__((__format__(__printf__, format_param, dots_param))) #else #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) #endif @@ -47,6 +51,7 @@ class Logger; class RandomAccessFile; class SequentialFile; class Slice; +struct DataVerificationInfo; class WritableFile; class RandomRWFile; class MemoryMappedFileBuffer; @@ -58,6 +63,8 @@ class RateLimiter; class ThreadStatusUpdater; struct ThreadStatus; class FileSystem; +class SystemClock; +struct ConfigOptions; const size_t kDefaultPageSize = 4 * 1024; @@ -149,8 +156,11 @@ class Env { }; Env(); - // Construct an Env with a separate FileSystem implementation - Env(std::shared_ptr fs); + // Construct an Env with a separate FileSystem and/or SystemClock + // implementation + explicit Env(const std::shared_ptr& fs); + Env(const std::shared_ptr& fs, + const std::shared_ptr& clock); // No copying allowed Env(const Env&) = delete; void operator=(const Env&) = delete; @@ -160,12 +170,44 @@ class Env { static const char* Type() { return "Environment"; } // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status LoadEnv(const std::string& value, Env** result); // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status LoadEnv(const std::string& value, Env** result, std::shared_ptr* guard); + // Loads the environment specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // + // @param config_options Controls how the environment is loaded. + // @param value the name and associated properties for the environment. + // @param result On success, the environment that was loaded. + // @param guard If specified and the loaded environment is not static, + // this value will contain the loaded environment (guard.get() == + // result). + // @return OK If the environment was successfully loaded (and optionally + // prepared) + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result); + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard); + + // Loads the environment specified by the env and fs uri. + // If both are specified, an error is returned. + // Otherwise, the environment is created by loading (via CreateFromString) + // the appropriate env/fs from the corresponding values. 
+ static Status CreateFromUri(const ConfigOptions& options, + const std::string& env_uri, + const std::string& fs_uri, Env** result, + std::shared_ptr* guard); + // Return a default environment suitable for the current operating // system. Sophisticated users may wish to provide their own Env // implementation instead of relying on this default environment. @@ -282,7 +324,8 @@ class Env { virtual Status FileExists(const std::string& fname) = 0; // Store in *result the names of the children of the specified directory. - // The names are relative to "dir". + // The names are relative to "dir", and shall never include the + // names `.` or `..`. // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. // NotFound if "dir" does not exist, the calling process does not have @@ -295,7 +338,8 @@ class Env { // In case the implementation lists the directory prior to iterating the files // and files are concurrently deleted, the deleted files will be omitted from // result. - // The name attributes are relative to "dir". + // The name attributes are relative to "dir", and shall never include the + // names `.` or `..`. // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. // NotFound if "dir" does not exist, the calling process does not have @@ -414,6 +458,21 @@ class Env { // When "function(arg)" returns, the thread will be destroyed. virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + // Start a new thread, invoking "function(args...)" within the new thread. + // When "function(args...)" returns, the thread will be destroyed. + template + void StartThreadTyped(FunctionT function, Args&&... args) { + using FWType = FunctorWrapper; + StartThread( + [](void* arg) { + auto* functor = static_cast(arg); + functor->invoke(); + delete functor; + }, + new FWType(std::function(function), + std::forward(args)...)); + } + // Wait for all threads started by StartThread to terminate. virtual void WaitForJoin() {} @@ -429,7 +488,7 @@ class Env { virtual Status GetTestDirectory(std::string* path) = 0; // Create and returns a default logger (an instance of EnvLogger) for storing - // informational messages. Derived classes can overide to provide custom + // informational messages. Derived classes can override to provide custom // logger. virtual Status NewLogger(const std::string& fname, std::shared_ptr* result); @@ -452,9 +511,15 @@ class Env { // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; - // Get the current host name. + // Get the current host name as a null terminated string iff the string + // length is < len. The hostname should otherwise be truncated to len. virtual Status GetHostName(char* name, uint64_t len) = 0; + // Get the current hostname from the given env as a std::string in result. + // The result may be truncated if the hostname is too + // long + virtual Status GetHostNameString(std::string* result); + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). // Only overwrites *unix_time on success. virtual Status GetCurrentTime(int64_t* unix_time) = 0; @@ -532,6 +597,13 @@ class Env { const EnvOptions& env_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new EnvOptions object that + // is a copy of the EnvOptions in the parameters, but is optimized for reading + // blob files. 
+ virtual EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const; + // Returns the status of all threads that belong to the current Env. virtual Status GetThreadList(std::vector* /*thread_list*/) { return Status::NotSupported("Env::GetThreadList() not supported."); @@ -567,6 +639,10 @@ class Env { // could be a fully implemented one, or a wrapper class around the Env const std::shared_ptr& GetFileSystem() const; + // Get the SystemClock implementation this Env was constructed with. It + // could be a fully implemented one, or a wrapper class around the Env + const std::shared_ptr& GetSystemClock() const; + // If you're adding methods here, remember to add them to EnvWrapper too. protected: @@ -576,6 +652,12 @@ class Env { // Pointer to the underlying FileSystem implementation std::shared_ptr file_system_; + + // Pointer to the underlying SystemClock implementation + std::shared_ptr system_clock_; + + private: + static const size_t kMaxHostNameLen = 256; }; // The factory function to construct a ThreadStatusUpdater. Any Env @@ -596,6 +678,10 @@ class SequentialFile { // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // REQUIRES: External synchronization virtual Status Read(size_t n, Slice* result, char* scratch) = 0; @@ -641,7 +727,8 @@ struct ReadRequest { // File offset in bytes uint64_t offset; - // Length to read in bytes + // Length to read in bytes. `result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can @@ -670,6 +757,10 @@ class RandomAccessFile { // "*result" is used. If an error was encountered, returns a non-OK // status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. virtual Status Read(uint64_t offset, size_t n, Slice* result, @@ -765,10 +856,22 @@ class WritableFile { virtual ~WritableFile(); // Append data to the end of the file - // Note: A WriteabelFile object must support either Append or + // Note: A WriteableFile object must support either Append or // PositionedAppend, so the users cannot mix the two. virtual Status Append(const Slice& data) = 0; + // Append data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual Status Append(const Slice& data, + const DataVerificationInfo& /* verification_info */) { + return Append(data); + } + // PositionedAppend data to the specified offset. The new EOF after append // must be larger than the previous EOF. 
This is to be used when writes are // not backed by OS buffers and hence has to always start from the start of @@ -795,6 +898,19 @@ class WritableFile { "WritableFile::PositionedAppend() not supported."); } + // PositionedAppend data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual Status PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const DataVerificationInfo& /* verification_info */) { + return Status::NotSupported("PositionedAppend"); + } + // Truncate is necessary to trim the file to the correct size // before closing. It is not always possible to keep track of the file // size due to whole pages writes. The behavior is undefined if called @@ -954,6 +1070,11 @@ class RandomRWFile { // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Returns Status::OK() on success. virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const = 0; @@ -1042,11 +1163,17 @@ class Logger { virtual void LogHeader(const char* format, va_list ap) { // Default implementation does a simple INFO level log write. // Please override as per the logger class requirement. - Logv(format, ap); + Logv(InfoLogLevel::INFO_LEVEL, format, ap); } // Write an entry to the log file with the specified format. - virtual void Logv(const char* format, va_list ap) = 0; + // + // Users who override the `Logv()` overload taking `InfoLogLevel` do not need + // to implement this, unless they explicitly invoke it in + // `Logv(InfoLogLevel, ...)`. + virtual void Logv(const char* /* format */, va_list /* ap */) { + assert(false); + } // Write an entry to the log file with the specified log level // and format. 
Any log with level under the internal log level @@ -1426,6 +1553,11 @@ class EnvWrapper : public Env { const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(env_options, db_options); } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(env_options, db_options); + } Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { return target_->GetFreeSpace(path, diskfree); } @@ -1497,9 +1629,18 @@ class WritableFileWrapper : public WritableFile { explicit WritableFileWrapper(WritableFile* t) : target_(t) {} Status Append(const Slice& data) override { return target_->Append(data); } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + return target_->Append(data, verification_info); + } Status PositionedAppend(const Slice& data, uint64_t offset) override { return target_->PositionedAppend(data, offset); } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + return target_->PositionedAppend(data, offset, verification_info); + } Status Truncate(uint64_t size) override { return target_->Truncate(size); } Status Close() override { return target_->Close(); } Status Flush() override { return target_->Flush(); } @@ -1648,6 +1789,6 @@ Env* NewTimedEnv(Env* base_env); Status NewEnvLogger(const std::string& fname, Env* env, std::shared_ptr* result); -std::unique_ptr NewCompositeEnv(std::shared_ptr fs); +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index 6c29dc953e9..7a76ec867b7 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -10,6 +10,7 @@ #include #include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { @@ -72,7 +73,7 @@ class BlockCipher { // - ROT13 Create a ROT13 Cipher // - ROT13:nn Create a ROT13 Cipher with block size of nn // @param result The new cipher object - // @return OK if the cipher was sucessfully created + // @return OK if the cipher was successfully created // @return NotFound if an invalid name was specified in the value // @return InvalidArgument if either the options were not valid static Status CreateFromString(const ConfigOptions& config_options, @@ -117,7 +118,7 @@ class EncryptionProvider { // - CTR Create a CTR provider // - test://CTR Create a CTR provider and initialize it for tests. // @param result The new provider object - // @return OK if the provider was sucessfully created + // @return OK if the provider was successfully created // @return NotFound if an invalid name was specified in the value // @return InvalidArgument if either the options were not valid static Status CreateFromString(const ConfigOptions& config_options, @@ -171,9 +172,9 @@ class EncryptionProvider { virtual Status TEST_Initialize() { return Status::OK(); } }; -class EncryptedSequentialFile : public SequentialFile { +class EncryptedSequentialFile : public FSSequentialFile { protected: - std::unique_ptr file_; + std::unique_ptr file_; std::unique_ptr stream_; uint64_t offset_; size_t prefixLength_; @@ -181,7 +182,7 @@ class EncryptedSequentialFile : public SequentialFile { public: // Default ctor. 
Given underlying sequential file is supposed to be at // offset == prefixLength. - EncryptedSequentialFile(std::unique_ptr&& f, + EncryptedSequentialFile(std::unique_ptr&& f, std::unique_ptr&& s, size_t prefixLength) : file_(std::move(f)), @@ -197,7 +198,8 @@ class EncryptedSequentialFile : public SequentialFile { // If an error was encountered, returns a non-OK status. // // REQUIRES: External synchronization - virtual Status Read(size_t n, Slice* result, char* scratch) override; + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; // Skip "n" bytes from the file. This is guaranteed to be no // slower that reading the same data, but may be faster. @@ -206,36 +208,37 @@ class EncryptedSequentialFile : public SequentialFile { // file, and Skip will return OK. // // REQUIRES: External synchronization - virtual Status Skip(uint64_t n) override; + IOStatus Skip(uint64_t n) override; // Indicates the upper layers if the current SequentialFile implementation // uses direct IO. - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; }; // A file abstraction for randomly reading the contents of a file. -class EncryptedRandomAccessFile : public RandomAccessFile { +class EncryptedRandomAccessFile : public FSRandomAccessFile { protected: - std::unique_ptr file_; + std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: - EncryptedRandomAccessFile(std::unique_ptr&& f, + EncryptedRandomAccessFile(std::unique_ptr&& f, std::unique_ptr&& s, size_t prefixLength) : file_(std::move(f)), @@ -252,11 +255,13 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; // Readahead the file starting from offset by n bytes for caching. - virtual Status Prefetch(uint64_t offset, size_t n) override; + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override; // Tries to get an unique ID for this file that will be the same each time // the file is opened (and will stay the same while the file is open). @@ -273,71 +278,76 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // a single varint. // // Note: these IDs are only valid for the duration of the process. 
- virtual size_t GetUniqueId(char* id, size_t max_size) const override; + size_t GetUniqueId(char* id, size_t max_size) const override; - virtual void Hint(AccessPattern pattern) override; + void Hint(AccessPattern pattern) override; // Indicates the upper layers if the current RandomAccessFile implementation // uses direct IO. - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; }; // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. -class EncryptedWritableFile : public WritableFileWrapper { +class EncryptedWritableFile : public FSWritableFile { protected: - std::unique_ptr file_; + std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: // Default ctor. Prefix is assumed to be written already. - EncryptedWritableFile(std::unique_ptr&& f, + EncryptedWritableFile(std::unique_ptr&& f, std::unique_ptr&& s, size_t prefixLength) - : WritableFileWrapper(f.get()), - file_(std::move(f)), + : file_(std::move(f)), stream_(std::move(s)), prefixLength_(prefixLength) {} - Status Append(const Slice& data) override; + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; - Status PositionedAppend(const Slice& data, uint64_t offset) override; + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; // Indicates the upper layers if the current WritableFile implementation // uses direct IO. - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; /* * Get the size of valid data in the file. */ - virtual uint64_t GetFileSize() override; + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; // Truncate is necessary to trim the file to the correct size // before closing. It is not always possible to keep track of the file // size due to whole pages writes. The behavior is undefined if called // with other writes to follow. - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. // This call has no effect on dirty pages in the cache. - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; // Sync a file range with disk. // offset is the starting byte of the file range to be synchronized. 
@@ -345,28 +355,42 @@ class EncryptedWritableFile : public WritableFileWrapper { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override; + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override; // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation // of space on devices where it can result in less file // fragmentation and/or less waste from over-zealous filesystem // pre-allocation. - virtual void PrepareWrite(size_t offset, size_t len) override; + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override; + + void SetPreallocationBlockSize(size_t size) override; + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override; // Pre-allocates space for a file. - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; // A file abstraction for random reading and writing. -class EncryptedRandomRWFile : public RandomRWFile { +class EncryptedRandomRWFile : public FSRandomRWFile { protected: - std::unique_ptr file_; + std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: - EncryptedRandomRWFile(std::unique_ptr&& f, + EncryptedRandomRWFile(std::unique_ptr&& f, std::unique_ptr&& s, size_t prefixLength) : file_(std::move(f)), @@ -375,31 +399,49 @@ class EncryptedRandomRWFile : public RandomRWFile { // Indicates if the class makes use of direct I/O // If false you must pass aligned buffer to Write() - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - virtual Status Write(uint64_t offset, const Slice& data) override; + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. 
- virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() override; + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; +class EncryptedFileSystem : public FileSystemWrapper { + public: + explicit EncryptedFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. + // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; +}; } // namespace ROCKSDB_NAMESPACE #endif // !defined(ROCKSDB_LITE) diff --git a/include/rocksdb/file_checksum.h b/include/rocksdb/file_checksum.h index 37b1744ce47..00b2b9d5740 100644 --- a/include/rocksdb/file_checksum.h +++ b/include/rocksdb/file_checksum.h @@ -76,7 +76,7 @@ class FileChecksumGenFactory { }; // FileChecksumList stores the checksum information of a list of files (e.g., -// SST files). The FileChecksumLIst can be used to store the checksum +// SST files). The FileChecksumList can be used to store the checksum // information of all SST file getting from the MANIFEST, which are // the checksum information of all valid SST file of a DB instance. It can // also be used to store the checksum information of a list of SST files to @@ -116,7 +116,7 @@ class FileChecksumList { // Create a new file checksum list. extern FileChecksumList* NewFileChecksumList(); -// Return a shared_ptr of the builtin Crc32c based file checksum generatory +// Return a shared_ptr of the builtin Crc32c based file checksum generator // factory object, which can be shared to create the Crc32c based checksum // generator object. // Note: this implementation is compatible with many other crc32c checksum diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 3683491c1ba..025908e4f97 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -17,6 +17,7 @@ #pragma once #include + #include #include #include @@ -25,9 +26,11 @@ #include #include #include + #include "rocksdb/env.h" #include "rocksdb/io_status.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "rocksdb/thread_status.h" namespace ROCKSDB_NAMESPACE { @@ -43,6 +46,7 @@ class Slice; struct ImmutableDBOptions; struct MutableDBOptions; class RateLimiter; +struct ConfigOptions; using AccessPattern = RandomAccessFile::AccessPattern; using FileAttributes = Env::FileAttributes; @@ -97,16 +101,30 @@ struct FileOptions : EnvOptions { // to be issued for the file open/creation IOOptions io_options; - FileOptions() : EnvOptions() {} + // EXPERIMENTAL + // The feature is in development and is subject to change. 
+ // When creating a new file, set the temperature of the file so that + // underlying file systems can put it with appropriate storage media and/or + // coding. + Temperature temperature = Temperature::kUnknown; + + // The checksum type that is used to calculate the checksum value for + // handoff during file writes. + ChecksumType handoff_checksum_type; + + FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) - : EnvOptions(opts) {} + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const EnvOptions& opts) - : EnvOptions(opts) {} + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const FileOptions& opts) - : EnvOptions(opts), io_options(opts.io_options) {} + : EnvOptions(opts), + io_options(opts.io_options), + temperature(opts.temperature), + handoff_checksum_type(opts.handoff_checksum_type) {} FileOptions& operator=(const FileOptions& opts) = default; }; @@ -123,12 +141,36 @@ struct IODebugContext { // To be set by the FileSystem implementation std::string msg; + // To be set by the underlying FileSystem implementation. + std::string request_id; + + // In order to log required information in IO tracing for different + // operations, Each bit in trace_data stores which corresponding info from + // IODebugContext will be added in the trace. Foreg, if trace_data = 1, it + // means bit at position 0 is set so TraceData::kRequestID (request_id) will + // be logged in the trace record. + // + enum TraceData : char { + // The value of each enum represents the bitwise position for + // that information in trace_data which will be used by IOTracer for + // tracing. Make sure to add them sequentially. + kRequestID = 0, + }; + uint64_t trace_data = 0; + IODebugContext() {} void AddCounter(std::string& name, uint64_t value) { counters.emplace(name, value); } + // Called by underlying file system to set request_id and log request_id in + // IOTracing. + void SetRequestId(const std::string& _request_id) { + request_id = _request_id; + trace_data |= (1 << TraceData::kRequestID); + } + std::string ToString() { std::ostringstream ss; ss << file_path << ", "; @@ -168,9 +210,24 @@ class FileSystem { static const char* Type() { return "FileSystem"; } // Loads the FileSystem specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status Load(const std::string& value, std::shared_ptr* result); + // Loads the FileSystem specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // @param config_options Controls how the FileSystem is loaded + // @param value The name and optional properties describing the file system + // to load. + // @param result On success, returns the loaded FileSystem + // @return OK if the FileSystem was successfully loaded. + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + // Return a default fie_system suitable for the current operating // system. 
Sophisticated users may wish to provide their own Env // implementation instead of relying on this default file_system @@ -262,7 +319,7 @@ class FileSystem { virtual IOStatus ReopenWritableFile( const std::string& /*fname*/, const FileOptions& /*options*/, std::unique_ptr* /*result*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("ReopenWritableFile"); } // Reuse an existing file by renaming it and opening it as writable. @@ -366,6 +423,10 @@ class FileSystem { return IOStatus::OK(); } +// This seems to clash with a macro on Windows, so #undef it here +#ifdef DeleteFile +#undef DeleteFile +#endif // Delete the named file. virtual IOStatus DeleteFile(const std::string& fname, const IOOptions& options, @@ -460,7 +521,7 @@ class FileSystem { IODebugContext* dbg) = 0; // Create and returns a default logger (an instance of EnvLogger) for storing - // informational messages. Derived classes can overide to provide custom + // informational messages. Derived classes can override to provide custom // logger. virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, std::shared_ptr* result, @@ -513,6 +574,13 @@ class FileSystem { const FileOptions& file_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // reading blob files. + virtual FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const; + // This seems to clash with a macro on Windows, so #undef it here #ifdef GetFreeSpace #undef GetFreeSpace @@ -523,7 +591,7 @@ class FileSystem { const IOOptions& /*options*/, uint64_t* /*diskfree*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("GetFreeSpace"); } virtual IOStatus IsDirectory(const std::string& /*path*/, @@ -550,6 +618,10 @@ class FSSequentialFile { // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // REQUIRES: External synchronization virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) = 0; @@ -584,7 +656,7 @@ class FSSequentialFile { const IOOptions& /*options*/, Slice* /*result*/, char* /*scratch*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedRead"); } // If you're adding methods here, remember to add them to @@ -596,7 +668,8 @@ struct FSReadRequest { // File offset in bytes uint64_t offset; - // Length to read in bytes + // Length to read in bytes. `result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can @@ -626,6 +699,10 @@ class FSRandomAccessFile { // "*result" is used. If an error was encountered, returns a non-OK // status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. 
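The Read contract spelled out above (result->size() < n only at end of file, or with a non-OK status) makes a simple short-read loop sufficient for draining a file. A minimal sketch under that assumption, using the default FileSystem; the helper name and chunk size are illustrative.

#include <string>
#include <vector>

#include "rocksdb/file_system.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: read an entire file through FSSequentialFile, treating any
// short read as end of file per the contract documented above.
IOStatus ReadWholeFile(const std::string& fname, std::string* out) {
  std::shared_ptr<FileSystem> fs = FileSystem::Default();
  std::unique_ptr<FSSequentialFile> file;
  IOStatus s = fs->NewSequentialFile(fname, FileOptions(), &file, /*dbg=*/nullptr);
  if (!s.ok()) {
    return s;
  }
  constexpr size_t kChunk = 64 * 1024;
  std::vector<char> scratch(kChunk);
  Slice result;
  do {
    s = file->Read(kChunk, IOOptions(), &result, scratch.data(), /*dbg=*/nullptr);
    if (!s.ok()) {
      return s;
    }
    out->append(result.data(), result.size());
  } while (result.size() == kChunk);  // short read => end of file
  return IOStatus::OK();
}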
virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, @@ -638,7 +715,7 @@ class FSRandomAccessFile { virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("Prefetch"); } // Read a bunch of blocks as described by reqs. The blocks can @@ -703,7 +780,7 @@ class FSRandomAccessFile { }; // A data structure brings the data verification information, which is -// used togther with data being written to a file. +// used together with data being written to a file. struct DataVerificationInfo { // checksum of the data being written. Slice checksum; @@ -731,15 +808,19 @@ class FSWritableFile { virtual ~FSWritableFile() {} // Append data to the end of the file - // Note: A WriteabelFile object must support either Append or + // Note: A WriteableFile object must support either Append or // PositionedAppend, so the users cannot mix the two. virtual IOStatus Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) = 0; - // EXPERIMENTAL / CURRENTLY UNUSED - // Append data with verification information + // Append data with verification information. // Note that this API change is experimental and it might be changed in - // the future. Currently, RocksDB does not use this API. + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). virtual IOStatus Append(const Slice& data, const IOOptions& options, const DataVerificationInfo& /* verification_info */, IODebugContext* dbg) { @@ -770,19 +851,23 @@ class FSWritableFile { uint64_t /* offset */, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedAppend"); } - // EXPERIMENTAL / CURRENTLY UNUSED // PositionedAppend data with verification information. // Note that this API change is experimental and it might be changed in - // the future. Currently, RocksDB does not use this API. + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). virtual IOStatus PositionedAppend( const Slice& /* data */, uint64_t /* offset */, const IOOptions& /*options*/, const DataVerificationInfo& /* verification_info */, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedAppend"); } // Truncate is necessary to trim the file to the correct size @@ -954,6 +1039,11 @@ class FSRandomRWFile { // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Returns Status::OK() on success. 
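As the checksum-handoff comments above note, an FSWritableFile that does not support the handed-off checksum type may ignore DataVerificationInfo. A minimal sketch of that fallback in a forwarding wrapper; the class name is illustrative and only the pure-virtual methods are overridden here.

#include <memory>
#include <utility>

#include "rocksdb/file_system.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: forwards writes to a wrapped file and ignores the handed-off
// checksum, which the comments above describe as acceptable behavior.
class PassthroughWritableFile : public FSWritableFile {
 public:
  explicit PassthroughWritableFile(std::unique_ptr<FSWritableFile>&& target)
      : target_(std::move(target)) {}

  IOStatus Append(const Slice& data, const IOOptions& options,
                  IODebugContext* dbg) override {
    return target_->Append(data, options, dbg);
  }
  IOStatus Append(const Slice& data, const IOOptions& options,
                  const DataVerificationInfo& /*verification_info*/,
                  IODebugContext* dbg) override {
    // No checksum support here: fall back to the plain Append.
    return Append(data, options, dbg);
  }
  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
    return target_->Close(options, dbg);
  }
  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
    return target_->Flush(options, dbg);
  }
  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
    return target_->Sync(options, dbg);
  }

 private:
  std::unique_ptr<FSWritableFile> target_;
};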
virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -1048,7 +1138,8 @@ class FSDirectory { class FileSystemWrapper : public FileSystem { public: // Initialize an EnvWrapper that delegates all calls to *t - explicit FileSystemWrapper(std::shared_ptr t) : target_(t) {} + explicit FileSystemWrapper(const std::shared_ptr& t) + : target_(t) {} ~FileSystemWrapper() override {} const char* Name() const override { return target_->Name(); } @@ -1229,6 +1320,11 @@ class FileSystemWrapper : public FileSystem { const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(file_options, db_options); } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, uint64_t* diskfree, IODebugContext* dbg) override { return target_->GetFreeSpace(path, options, diskfree, dbg); diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 3cd85a22601..fc1985d323e 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -21,6 +21,7 @@ #include +#include #include #include #include @@ -28,6 +29,7 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/status.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { @@ -40,33 +42,48 @@ class FilterBitsBuilder { public: virtual ~FilterBitsBuilder() {} - // Add Key to filter, you could use any way to store the key. - // Such as: storing hashes or original keys - // Keys are in sorted order and duplicated keys are possible. + // Add a key (or prefix) to the filter. Typically, a builder will keep + // a set of 64-bit key hashes and only build the filter in Finish + // when the final number of keys is known. Keys are added in sorted order + // and duplicated keys are possible, so typically, the builder will + // only add this key if its hash is different from the most recently + // added. virtual void AddKey(const Slice& key) = 0; + // Called by RocksDB before Finish to populate + // TableProperties::num_filter_entries, so should represent the + // number of unique keys (and/or prefixes) added, but does not have + // to be exact. + virtual size_t EstimateEntriesAdded() { + // Default implementation for backward compatibility. + // 0 conspicuously stands for "unknown". + return 0; + } + // Generate the filter using the keys that are added // The return value of this function would be the filter bits, // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; - // Calculate num of keys that can be added and generate a filter - // <= the specified number of bytes. -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4702) // unreachable code -#endif - virtual int CalculateNumEntry(const uint32_t /*bytes*/) { -#ifndef ROCKSDB_LITE - throw std::runtime_error("CalculateNumEntry not Implemented"); -#else - abort(); -#endif - return 0; + // Approximate the number of keys that can be added and generate a filter + // <= the specified number of bytes. Callers (including RocksDB) should + // only use this result for optimizing performance and not as a guarantee. + // This default implementation is for compatibility with older custom + // FilterBitsBuilders only implementing deprecated CalculateNumEntry. 
+ virtual size_t ApproximateNumEntries(size_t bytes) { + bytes = std::min(bytes, size_t{0xffffffff}); + return static_cast(CalculateNumEntry(static_cast(bytes))); + } + + // Old, DEPRECATED version of ApproximateNumEntries. This is not + // called by RocksDB except as the default implementation of + // ApproximateNumEntries for API compatibility. + virtual int CalculateNumEntry(const uint32_t bytes) { + // DEBUG: ideally should not rely on this implementation + assert(false); + // RELEASE: something reasonably conservative: 2 bytes per entry + return static_cast(bytes / 2); } -#if defined(_MSC_VER) -#pragma warning(pop) -#endif }; // A class that checks if a key can be in filter @@ -96,18 +113,32 @@ struct FilterBuildingContext { // Options for the table being built const BlockBasedTableOptions& table_options; - // Name of the column family for the table (or empty string if unknown) - std::string column_family_name; - - // The compactions style in effect for the table + // BEGIN from (DB|ColumnFamily)Options in effect at table creation time CompactionStyle compaction_style = kCompactionStyleLevel; - // The table level at time of constructing the SST file, or -1 if unknown. - // (The table file could later be used at a different level.) - int level_at_creation = -1; + // Number of LSM levels, or -1 if unknown + int num_levels = -1; // An optional logger for reporting errors, warnings, etc. Logger* info_log = nullptr; + // END from (DB|ColumnFamily)Options + + // Name of the column family for the table (or empty string if unknown) + // TODO: consider changing to Slice + std::string column_family_name; + + // The table level at time of constructing the SST file, or -1 if unknown + // or N/A as in SstFileWriter. (The table file could later be used at a + // different level.) + int level_at_creation = -1; + + // True if known to be going into bottommost sorted run for applicable + // key range (which might not even be last level with data). False + // otherwise. + bool is_bottommost = false; + + // Reason for creating the file with the filter + TableFileCreationReason reason = TableFileCreationReason::kMisc; }; // We add a new format of filter block called full filter block @@ -212,4 +243,35 @@ class FilterPolicy { // trailing spaces in keys. extern const FilterPolicy* NewBloomFilterPolicy( double bits_per_key, bool use_block_based_builder = false); + +// A new Bloom alternative that saves about 30% space compared to +// Bloom filters, with similar query times but roughly 3-4x CPU time +// and 3x temporary space usage during construction. For example, if +// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same +// 0.95% FP rate as Bloom filter but only using about 7 bits per key. +// +// Ribbon filters are compatible with RocksDB >= 6.15.0. Earlier +// versions reading the data will behave as if no filter was used +// (degraded performance until compaction rebuilds filters). All +// built-in FilterPolicies (Bloom or Ribbon) are able to read other +// kinds of built-in filters. +// +// Note: the current Ribbon filter schema uses some extra resources +// when constructing very large filters. For example, for 100 million +// keys in a single filter (one SST file without partitioned filters), +// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom. +// However, the savings in filter space from just ~60 open SST files +// makes up for the additional temporary memory use. +// +// Also consider using optimize_filters_for_memory to save filter +// memory. 
+extern const FilterPolicy* NewRibbonFilterPolicy( + double bloom_equivalent_bits_per_key); + +// Old name +inline const FilterPolicy* NewExperimentalRibbonFilterPolicy( + double bloom_equivalent_bits_per_key) { + return NewRibbonFilterPolicy(bloom_equivalent_bits_per_key); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/functor_wrapper.h b/include/rocksdb/functor_wrapper.h new file mode 100644 index 00000000000..c5f7414b1a8 --- /dev/null +++ b/include/rocksdb/functor_wrapper.h @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace detail { +template +struct IndexSequence {}; + +template +struct IndexSequenceHelper + : public IndexSequenceHelper {}; + +template +struct IndexSequenceHelper<0U, Next...> { + using type = IndexSequence; +}; + +template +using make_index_sequence = typename IndexSequenceHelper::type; + +template +void call(Function f, Tuple t, IndexSequence) { + f(std::get(t)...); +} + +template +void call(Function f, Tuple t) { + static constexpr auto size = std::tuple_size::value; + call(f, t, make_index_sequence{}); +} +} // namespace detail + +template +class FunctorWrapper { + public: + explicit FunctorWrapper(std::function functor, Args &&...args) + : functor_(std::move(functor)), args_(std::forward(args)...) {} + + void invoke() { detail::call(functor_, args_); } + + private: + std::function functor_; + std::tuple args_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index b31b6d70a4f..0f6ab692ea5 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -50,7 +50,15 @@ struct IOStatsContext { uint64_t cpu_read_nanos; }; -// Get Thread-local IOStatsContext object pointer +// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global, +// non-thread-local IOStatsContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local IOStatsContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. IOStatsContext* get_iostats_context(); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index aa2f2a3ff4c..eb3f42acd69 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -45,7 +45,6 @@ class Iterator : public Cleanable { // Position at the last key in the source. The iterator is // Valid() after this call iff the source is not empty. - // Currently incompatible with user timestamp. virtual void SeekToLast() = 0; // Position at the first key in the source that at or past target. @@ -60,7 +59,7 @@ class Iterator : public Cleanable { // Position at the last key in the source that at or before target. // The iterator is Valid() after this call iff the source contains // an entry that comes at or before target. - // Currently incompatible with user timestamp. + // Target does not contain timestamp. 
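A short usage sketch for the Ribbon policy declared above, wired into BlockBasedTableOptions the same way a Bloom policy is; the 10 bits-per-key figure mirrors the example in the comment, and the function name is illustrative.

#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: enable Ribbon filters at ~10 Bloom-equivalent bits per key.
Options MakeRibbonOptions() {
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(
      NewRibbonFilterPolicy(/*bloom_equivalent_bits_per_key=*/10));
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  return options;
}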
virtual void SeekForPrev(const Slice& target) = 0; // Moves to the next entry in the source. After this call, Valid() is @@ -70,7 +69,6 @@ class Iterator : public Cleanable { // Moves to the previous entry in the source. After this call, Valid() is // true iff the iterator was not positioned at the first entry in source. - // Currently incompatible with user timestamp. // REQUIRES: Valid() virtual void Prev() = 0; diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index e90a8707af4..dec3e442074 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -16,6 +16,7 @@ #include "rocksdb/compression_type.h" #include "rocksdb/status.h" #include "rocksdb/table_properties.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { @@ -27,13 +28,6 @@ class ColumnFamilyHandle; class Status; struct CompactionJobStats; -enum class TableFileCreationReason { - kFlush, - kCompaction, - kRecovery, - kMisc, -}; - struct TableFileCreationBriefInfo { // the name of the database where the file was created std::string db_name; @@ -118,8 +112,13 @@ enum class FlushReason : int { // When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable // will not be called to avoid many small immutable memtables. kErrorRecoveryRetryFlush = 0xc, + kWalFull = 0xd, }; +// TODO: In the future, BackgroundErrorReason will only be used to indicate +// why the BG Error is happening (e.g., flush, compaction). We may introduce +// other data structure to indicate other essential information such as +// the file type (e.g., Manifest, SST) and special context. enum class BackgroundErrorReason { kFlush, kCompaction, @@ -127,6 +126,7 @@ enum class BackgroundErrorReason { kMemTable, kManifestWrite, kFlushNoWAL, + kManifestWriteNoWAL, }; enum class WriteStallCondition { @@ -333,13 +333,18 @@ struct ExternalFileIngestionInfo { // be used as a building block for developing custom features such as // stats-collector or external compaction algorithm. // -// Note that callback functions should not run for an extended period of -// time before the function returns, otherwise RocksDB may be blocked. -// For example, it is not suggested to do DB::CompactFiles() (as it may -// run for a long while) or issue many of DB::Put() (as Put may be blocked -// in certain cases) in the same thread in the EventListener callback. -// However, doing DB::CompactFiles() and DB::Put() in another thread is -// considered safe. +// IMPORTANT +// Because compaction is needed to resolve a "writes stopped" condition, +// calling or waiting for any blocking DB write function (no_slowdown=false) +// from a compaction-related listener callback can hang RocksDB. For DB +// writes from a callback we recommend a WriteBatch and no_slowdown=true, +// because the WriteBatch can accumulate writes for later in case DB::Write +// returns Status::Incomplete. Similarly, calling CompactRange or similar +// could hang by waiting for a background worker that is occupied until the +// callback returns. +// +// Otherwise, callback functions should not run for an extended period of +// time before the function returns, because this will slow RocksDB. // // [Threading] All EventListener callback will be called using the // actual thread that involves in that specific event. 
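A sketch of the write pattern the IMPORTANT note above recommends for listener callbacks: a no_slowdown write whose batch is kept for a later retry when the DB reports Status::Incomplete. The listener, key, and retry queue are illustrative, and synchronization of the queue is omitted.

#include <utility>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/listener.h"
#include "rocksdb/write_batch.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: never wait on a write stall from inside a callback.
class FlushMarkerListener : public EventListener {
 public:
  void OnFlushCompleted(DB* db, const FlushJobInfo& /*info*/) override {
    WriteBatch batch;
    batch.Put("last_flush_marker", "1");  // illustrative payload
    WriteOptions write_options;
    write_options.no_slowdown = true;
    Status s = db->Write(write_options, &batch);
    if (s.IsIncomplete()) {
      // Writes are stalled; keep the batch and retry outside the callback.
      pending_.push_back(std::move(batch));  // synchronization omitted
    }
  }

 private:
  std::vector<WriteBatch> pending_;
};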
For example, it diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h index 60256a9772b..51442239a9e 100644 --- a/include/rocksdb/memory_allocator.h +++ b/include/rocksdb/memory_allocator.h @@ -45,31 +45,31 @@ struct JemallocAllocatorOptions { bool limit_tcache_size = false; // Lower bound of allocation size to use tcache, if limit_tcache_size=true. - // When used with block cache, it is recommneded to set it to block_size/4. + // When used with block cache, it is recommended to set it to block_size/4. size_t tcache_size_lower_bound = 1024; // Upper bound of allocation size to use tcache, if limit_tcache_size=true. - // When used with block cache, it is recommneded to set it to block_size. + // When used with block cache, it is recommended to set it to block_size. size_t tcache_size_upper_bound = 16 * 1024; }; -// Generate memory allocators which allocates through Jemalloc and utilize -// MADV_DONTDUMP through madvice to exclude cache items from core dump. +// Generate memory allocator which allocates through Jemalloc and utilize +// MADV_DONTDUMP through madvise to exclude cache items from core dump. // Applications can use the allocator with block cache to exclude block cache // usage from core dump. // // Implementation details: -// The JemallocNodumpAllocator creates a delicated jemalloc arena, and all -// allocations of the JemallocNodumpAllocator is through the same arena. -// The memory allocator hooks memory allocation of the arena, and call -// madvice() with MADV_DONTDUMP flag to exclude the piece of memory from -// core dump. Side benefit of using single arena would be reduce of jemalloc -// metadata for some workload. +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator are through the same arena. +// The memory allocator hooks memory allocation of the arena, and calls +// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from +// core dump. Side benefit of using single arena would be reduction of jemalloc +// metadata for some workloads. // // To mitigate mutex contention for using one single arena, jemalloc tcache // (thread-local cache) is enabled to cache unused allocations for future use. -// The tcache normally incur 0.5M extra memory usage per-thread. The usage -// can be reduce by limitting allocation sizes to cache. +// The tcache normally incurs 0.5M extra memory usage per-thread. The usage +// can be reduced by limiting allocation sizes to cache. extern Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 49723264a59..b8701135d3e 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -62,7 +62,7 @@ class MemTableRep { typedef ROCKSDB_NAMESPACE::Slice DecodedType; virtual DecodedType decode_key(const char* key) const { - // The format of key is frozen and can be terated as a part of the API + // The format of key is frozen and can be treated as a part of the API // contract. Refer to MemTable::Add for details. return GetLengthPrefixedSlice(key); } @@ -120,7 +120,7 @@ class MemTableRep { return true; } - // Same as ::InsertWithHint, but allow concurrnet write + // Same as ::InsertWithHint, but allow concurrent write // // If hint points to nullptr, a new hint will be allocated on heap, otherwise // the hint will be updated to reflect the last insert location. 
The hint is diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index a0a99fc4a99..0b04ec3102b 100755 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -109,7 +109,7 @@ class MergeOperator { Slice& existing_operand; }; - // This function applies a stack of merge operands in chrionological order + // This function applies a stack of merge operands in chronological order // on top of an existing value. There are two ways in which this method is // being used: // a) During Get() operation, it used to calculate the final value of a key @@ -176,7 +176,7 @@ class MergeOperator { // PartialMergeMulti should combine them into a single merge operation that is // saved into *new_value, and then it should return true. *new_value should // be constructed such that a call to DB::Merge(key, *new_value) would yield - // the same result as subquential individual calls to DB::Merge(key, operand) + // the same result as sequential individual calls to DB::Merge(key, operand) // for each operand in operand_list from front() to back(). // // The string that new_value is pointing to will be empty. diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 9a64a7a8f68..b515c51a1e5 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -11,6 +11,7 @@ #include #include +#include "rocksdb/options.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { @@ -62,6 +63,7 @@ struct SstFileMetaData { being_compacted(false), num_entries(0), num_deletions(0), + temperature(Temperature::kUnknown), oldest_blob_file_number(0), oldest_ancester_time(0), file_creation_time(0) {} @@ -71,7 +73,8 @@ struct SstFileMetaData { SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, - bool _being_compacted, uint64_t _oldest_blob_file_number, + bool _being_compacted, Temperature _temperature, + uint64_t _oldest_blob_file_number, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, std::string& _file_checksum, std::string& _file_checksum_func_name) @@ -87,6 +90,7 @@ struct SstFileMetaData { being_compacted(_being_compacted), num_entries(0), num_deletions(0), + temperature(_temperature), oldest_blob_file_number(_oldest_blob_file_number), oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), @@ -112,18 +116,21 @@ struct SstFileMetaData { uint64_t num_entries; uint64_t num_deletions; + // This feature is experimental and subject to change. + Temperature temperature; + uint64_t oldest_blob_file_number; // The id of the oldest blob file // referenced by the file. // An SST file may be generated by compactions whose input files may // in turn be generated by earlier compactions. The creation time of the - // oldest SST file that is the compaction ancester of this file. - // The timestamp is provided Env::GetCurrentTime(). + // oldest SST file that is the compaction ancestor of this file. + // The timestamp is provided SystemClock::GetCurrentTime(). // 0 if the information is not available. // // Note: for TTL blob files, it contains the start of the expiration range. uint64_t oldest_ancester_time; - // Timestamp when the SST file is created, provided by Env::GetCurrentTime(). - // 0 if the information is not available. + // Timestamp when the SST file is created, provided by + // SystemClock::GetCurrentTime(). 0 if the information is not available. 
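A sketch of how the jemalloc no-dump allocator described above is typically attached to a block cache. It assumes a jemalloc-enabled build (otherwise NewJemallocNodumpAllocator returns a non-OK status) and the LRUCacheOptions::memory_allocator field; names are illustrative.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: keep block cache contents out of core dumps when jemalloc is
// available; fall back to the default allocator otherwise.
std::shared_ptr<Cache> MakeNodumpBlockCache(size_t capacity) {
  JemallocAllocatorOptions jopts;
  jopts.limit_tcache_size = true;  // bound per-thread tcache usage
  std::shared_ptr<MemoryAllocator> allocator;
  Status s = NewJemallocNodumpAllocator(jopts, &allocator);
  LRUCacheOptions cache_opts;
  cache_opts.capacity = capacity;
  if (s.ok()) {
    cache_opts.memory_allocator = allocator;
  }
  return NewLRUCache(cache_opts);
}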
uint64_t file_creation_time; // The checksum of a SST file, the value is decided by the file content and diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 1a621eec656..86343958abc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -20,11 +20,14 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/comparator.h" #include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/data_structure.h" #include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/listener.h" #include "rocksdb/pre_release_callback.h" #include "rocksdb/sst_partitioner.h" +#include "rocksdb/types.h" #include "rocksdb/universal_compaction.h" #include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" @@ -58,6 +61,8 @@ class FileSystem; struct Options; struct DbPath; +using FileTypeSet = SmallEnumSet; + struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // The function recovers options to a previous version. Only 4.6 or later // versions are supported. @@ -125,9 +130,10 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Allows an application to modify/delete a key-value during background // compaction. // - // If the client requires a new compaction filter to be used for different - // compaction runs, it can specify compaction_filter_factory instead of this - // option. The client should specify only one of the two. + // If the client requires a new `CompactionFilter` to be used for different + // compaction runs and/or requires a `CompactionFilter` for table file + // creations outside of compaction, it can specify compaction_filter_factory + // instead of this option. The client should specify only one of the two. // compaction_filter takes precedence over compaction_filter_factory if // client specifies both. // @@ -138,12 +144,21 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Default: nullptr const CompactionFilter* compaction_filter = nullptr; - // This is a factory that provides compaction filter objects which allow - // an application to modify/delete a key-value during background compaction. + // This is a factory that provides `CompactionFilter` objects which allow + // an application to modify/delete a key-value during table file creation. + // + // Unlike the `compaction_filter` option, which is used when compaction + // creates a table file, this factory allows using a `CompactionFilter` when a + // table file is created for various reasons. The factory can decide what + // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by + // default the decision is to use a `CompactionFilter` for + // `TableFileCreationReason::kCompaction` only. // - // A new filter will be created on each compaction run. If multithreaded - // compaction is being used, each created CompactionFilter will only be used - // from a single thread and so does not need to be thread-safe. + // Each thread of work involving creating table files will create a new + // `CompactionFilter` when it will be used according to the above + // `TableFileCreationReason`-based decision. This allows the application to + // know about the different ongoing threads of work and makes it unnecessary + // for `CompactionFilter` to provide thread-safety. 
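A sketch of a factory that opts into filtering during flush as well as compaction, matching the reason-based behavior described above. The ShouldFilterTableFileCreation override name is an assumption (the hook is not shown in this hunk), and MyCompactionFilter stands in for a user-defined CompactionFilter.

#include <memory>

#include "rocksdb/compaction_filter.h"
#include "rocksdb/types.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: filter table files created by flush as well as compaction.
class MyFilterFactory : public CompactionFilterFactory {
 public:
  // Assumed hook name for the reason-based decision described above.
  bool ShouldFilterTableFileCreation(
      TableFileCreationReason reason) const override {
    return reason == TableFileCreationReason::kCompaction ||
           reason == TableFileCreationReason::kFlush;
  }
  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
      const CompactionFilter::Context& /*context*/) override {
    // MyCompactionFilter is a hypothetical user-defined CompactionFilter.
    return std::unique_ptr<CompactionFilter>(new MyCompactionFilter());
  }
  const char* Name() const override { return "MyFilterFactory"; }
};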
// // Default: nullptr std::shared_ptr compaction_filter_factory = nullptr; @@ -349,6 +364,37 @@ struct DbPath { DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} }; +extern const char* kHostnameForDbHostId; + +enum class CompactionServiceJobStatus : char { + kSuccess, + kFailure, + kUseLocal, // TODO: Add support for use local compaction +}; + +class CompactionService : public Customizable { + public: + static const char* Type() { return "CompactionService"; } + + // Returns the name of this compaction service. + virtual const char* Name() const = 0; + + // Start the compaction with input information, which can be passed to + // `DB::OpenAndCompact()`. + // job_id is pre-assigned, it will be reset after DB re-open. + // TODO: sub-compaction is not supported, as they will have the same job_id, a + // sub-compaction id might be added + virtual CompactionServiceJobStatus Start( + const std::string& compaction_service_input, int job_id) = 0; + + // Wait compaction to be finish. + // TODO: Add output path override + virtual CompactionServiceJobStatus WaitForComplete( + int job_id, std::string* compaction_service_result) = 0; + + virtual ~CompactionService() {} +}; + struct DBOptions { // The function recovers options to the option as in version 4.6. DBOptions* OldDefaults(int rocksdb_major_version = 4, @@ -391,6 +437,23 @@ struct DBOptions { // Default: true bool paranoid_checks = true; + // If true, during memtable flush, RocksDB will validate total entries + // read in flush, and compare with counter inserted into it. + // The option is here to turn the feature off in case this new validation + // feature has a bug. + // Default: true + bool flush_verify_memtable_count = true; + + // If true, the log numbers and sizes of the synced WALs are tracked + // in MANIFEST, then during DB recovery, if a synced WAL is missing + // from disk, or the WAL's size does not match the recorded size in + // MANIFEST, an error will be reported and the recovery will be aborted. + // + // Note that this option does not work with secondary instance. + // + // Default: false + bool track_and_verify_wals_in_manifest = false; + // Use the specified object to interact with the environment, // e.g. to read/write files, schedule background work, etc. In the near // future, support for doing storage operations such as read/write files @@ -821,7 +884,7 @@ struct DBOptions { // Allows OS to incrementally sync files to disk while they are being // written, asynchronously, in the background. This operation can be used // to smooth out write I/Os over time. Users shouldn't rely on it for - // persistency guarantee. + // persistence guarantee. // Issue one request for every bytes_per_sync written. 0 turns it off. // // You may consider using rate_limiter to regulate write rate to device. @@ -1179,6 +1242,38 @@ struct DBOptions { // // Default: false bool disable_manifest_sync = false; + + // A string identifying the machine hosting the DB. This + // will be written as a property in every SST file written by the DB (or + // by offline writers such as SstFileWriter and RepairDB). It can be useful + // for troubleshooting in memory corruption caused by a failing host when + // writing a file, by tracing back to the writing host. These corruptions + // may not be caught by the checksum since they happen before checksumming. + // If left as default, the table writer will substitute it with the actual + // hostname when writing the SST file. 
If set to an empty string, the + // property will not be written to the SST file. + // + // Default: hostname + std::string db_host_id = kHostnameForDbHostId; + + // Use this if your DB want to enable checksum handoff for specific file + // types writes. Make sure that the File_system you use support the + // crc32c checksum verification + // Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile. + // NOTE: currently RocksDB only generates crc32c based checksum for the + // handoff. If the storage layer has different checksum support, user + // should enble this set as empty. Otherwise,it may cause unexpected + // write failures. + FileTypeSet checksum_handoff_file_types; + + // EXPERIMENTAL + // CompactionService is a feature allows the user to run compactions on a + // different host or process, which offloads the background load from the + // primary host. + // It's an experimental feature, the interface will be changed without + // backward/forward compatibility support for now. Some known issues are still + // under development. + std::shared_ptr compaction_service = nullptr; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1253,19 +1348,28 @@ struct ReadOptions { // Default: nullptr const Slice* iterate_lower_bound; - // "iterate_upper_bound" defines the extent upto which the forward iterator + // "iterate_upper_bound" defines the extent up to which the forward iterator // can returns entries. Once the bound is reached, Valid() will be false. // "iterate_upper_bound" is exclusive ie the bound value is - // not a valid entry. If prefix_extractor is not null, the Seek target - // and iterate_upper_bound need to have the same prefix. - // This is because ordering is not guaranteed outside of prefix domain. + // not a valid entry. If prefix_extractor is not null: + // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used + // to infer whether prefix iterating (e.g. applying prefix bloom filter) + // can be used within RocksDB. This is done by comparing + // iterate_upper_bound with the seek key. + // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes + // effect if it shares the same prefix as the seek key. If + // iterate_upper_bound is outside the prefix of the seek key, then keys + // returned outside the prefix range will be undefined, just as if + // iterate_upper_bound = null. + // If iterate_upper_bound is not null, SeekToLast() will position the iterator + // at the first key smaller than iterate_upper_bound. // // Default: nullptr const Slice* iterate_upper_bound; // RocksDB does auto-readahead for iterators on noticing more than two reads // for a table file. The readahead starts at 8KB and doubles on every - // additional read upto 256KB. + // additional read up to 256KB. // This option can help if most of the range scans are large, and if it is // determined that a larger readahead than that enabled by auto-readahead is // needed. @@ -1323,7 +1427,7 @@ struct ReadOptions { // When true, by default use total_order_seek = true, and RocksDB can // selectively enable prefix seek mode if won't generate a different result // from total_order_seek, based on seek key, and iterator upper bound. - // Not suppported in ROCKSDB_LITE mode, in the way that even with value true + // Not supported in ROCKSDB_LITE mode, in the way that even with value true // prefix mode is not used. 
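A usage sketch of the clarified iterate_upper_bound semantics together with auto_prefix_mode: the bound is exclusive, must outlive the iterator, and with auto_prefix_mode=true it lets RocksDB decide internally whether prefix seek can be applied. The key range shown is illustrative.

#include <memory>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: bounded forward scan over ["user1000", "user2000").
void ScanRange(DB* db) {
  ReadOptions read_options;
  Slice upper_bound("user2000");
  read_options.iterate_upper_bound = &upper_bound;  // exclusive, must outlive `it`
  read_options.auto_prefix_mode = true;
  std::unique_ptr<Iterator> it(db->NewIterator(read_options));
  for (it->Seek("user1000"); it->Valid(); it->Next()) {
    // it->key() / it->value() stay within the bounded range.
  }
}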
// Default: false bool auto_prefix_mode; @@ -1399,7 +1503,7 @@ struct ReadOptions { // A timeout in microseconds to be passed to the underlying FileSystem for // reads. As opposed to deadline, this determines the timeout for each // individual file read request. If a MultiGet/Get/Seek/Next etc call - // results in multiple reads, each read can last upto io_timeout us. + // results in multiple reads, each read can last up to io_timeout us. std::chrono::microseconds io_timeout; // It limits the maximum cumulative value size of the keys in batch while @@ -1453,7 +1557,7 @@ struct WriteOptions { bool no_slowdown; // If true, this write request is of lower priority if compaction is - // behind. In this case, no_slowdown = true, the request will be cancelled + // behind. In this case, no_slowdown = true, the request will be canceled // immediately with Status::Incomplete() returned. Otherwise, it will be // slowed down. The slowdown value is determined by RocksDB to guarantee // it introduces minimum impacts to high priority writes. @@ -1572,6 +1676,12 @@ struct CompactRangeOptions { bool allow_write_stall = false; // If > 0, it will replace the option in the DBOptions for this compaction. uint32_t max_subcompactions = 0; + // Set user-defined timestamp low bound, the data with older timestamp than + // low bound maybe GCed by compaction. Default: nullptr + Slice* full_history_ts_low = nullptr; + + // Allows cancellation of an in-progress manual compaction. + std::atomic* canceled = nullptr; }; // IngestExternalFileOptions is used by IngestExternalFile() @@ -1591,7 +1701,7 @@ struct IngestExternalFileOptions { bool allow_blocking_flush = true; // Set to true if you would like duplicate keys in the file being ingested // to be skipped rather than overwriting existing data under that key. - // Usecase: back-fill of some historical data in the database without + // Use case: back-fill of some historical data in the database without // over-writing existing newer version of data. // This option could only be used if the DB has been running // with allow_ingest_behind=true since the dawn of time. @@ -1631,7 +1741,7 @@ struct IngestExternalFileOptions { // will be ignored; 2) If DB enable the checksum function, we calculate the // sst file checksum after the file is moved or copied and compare the // checksum and checksum name. If checksum or checksum function name does - // not match, ingestion will be failed. If the verification is sucessful, + // not match, ingestion will be failed. If the verification is successful, // checksum and checksum function name will be stored in Manifest. // If this option is set to FALSE, 1) if DB does not enable checksum, // the ingested checksum information will be ignored; 2) if DB enable the @@ -1696,4 +1806,20 @@ struct SizeApproximationOptions { double files_size_error_margin = -1.0; }; +struct CompactionServiceOptionsOverride { + // Currently pointer configurations are not passed to compaction service + // compaction so the user needs to set it. It will be removed once pointer + // configuration passing is supported. 
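A sketch of the new cancellation hook on CompactRangeOptions shown above: another thread sets the shared atomic, and the in-progress manual compaction is expected to stop with a non-OK status. Names and the exact returned status are illustrative.

#include <atomic>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: a manual compaction that can be aborted from another thread.
std::atomic<bool> manual_compaction_canceled{false};

Status RunFullCompaction(DB* db) {
  CompactRangeOptions cro;
  cro.canceled = &manual_compaction_canceled;  // polled while compaction runs
  // Expected to return a non-OK status if the flag is set mid-compaction.
  return db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
}

void CancelManualCompaction() { manual_compaction_canceled.store(true); }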
+ Env* env = Env::Default(); + std::shared_ptr file_checksum_gen_factory = nullptr; + + const Comparator* comparator = BytewiseComparator(); + std::shared_ptr merge_operator = nullptr; + const CompactionFilter* compaction_filter = nullptr; + std::shared_ptr compaction_filter_factory = nullptr; + std::shared_ptr prefix_extractor = nullptr; + std::shared_ptr table_factory; + std::shared_ptr sst_partitioner_factory = nullptr; +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 3d61000cc57..699f57344f1 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -57,7 +57,7 @@ struct PerfContext { // enable per level perf context and allocate storage for PerfContextByLevel void EnablePerLevelPerfContext(); - // temporarily disable per level perf contxt by setting the flag to false + // temporarily disable per level perf context by setting the flag to false void DisablePerLevelPerfContext(); // free the space for PerfContextByLevel, also disable per level perf context @@ -230,8 +230,15 @@ struct PerfContext { bool per_level_perf_context_enabled = false; }; -// Get Thread-local PerfContext object pointer -// if defined(NPERF_CONTEXT), then the pointer is not thread-local +// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global, +// non-thread-local PerfContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local PerfContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. PerfContext* get_perf_context(); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/secondary_cache.h b/include/rocksdb/secondary_cache.h new file mode 100644 index 00000000000..221b3e5f231 --- /dev/null +++ b/include/rocksdb/secondary_cache.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include + +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A handle for lookup result. The handle may not be immediately ready or +// have a valid value. The caller must call isReady() to determine if its +// ready, and call Wait() in order to block until it becomes ready. +// The caller must call value() after it becomes ready to determine if the +// handle successfullly read the item. +class SecondaryCacheResultHandle { + public: + virtual ~SecondaryCacheResultHandle() {} + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. 
If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; + +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +class SecondaryCache { + public: + virtual ~SecondaryCache() {} + + virtual std::string Name() = 0; + + static const std::string Type() { return "SecondaryCache"; } + + // Insert the given value into this cache. The value is not written + // directly. Rather, the SaveToCallback provided by helper_cb will be + // used to extract the persistable data in value, which will be written + // to this tier. The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status is + // success. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0; + + // At the discretion of the implementation, erase the data associated + // with key + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready + virtual void WaitAll(std::vector handles) = 0; + + virtual std::string GetPrintableOptions() const = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index 54f61f9d2a4..589636744ee 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -62,7 +62,7 @@ class SliceTransform { virtual bool InRange(const Slice& /*dst*/) const { return false; } // Some SliceTransform will have a full length which can be used to - // determine if two keys are consecuitive. Can be disabled by always + // determine if two keys are consecutive. Can be disabled by always // returning 0 virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } diff --git a/include/rocksdb/sst_file_manager.h b/include/rocksdb/sst_file_manager.h index 350dec7a8bb..5aae88dc1ed 100644 --- a/include/rocksdb/sst_file_manager.h +++ b/include/rocksdb/sst_file_manager.h @@ -19,17 +19,16 @@ namespace ROCKSDB_NAMESPACE { class Env; class Logger; -// SstFileManager is used to track SST files in the DB and control their -// deletion rate. -// All SstFileManager public functions are thread-safe. +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. // SstFileManager is not extensible. class SstFileManager { public: virtual ~SstFileManager() {} // Update the maximum allowed space that should be used by RocksDB, if - // the total size of the SST files exceeds max_allowed_space, writes to - // RocksDB will fail. + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. // // Setting max_allowed_space to 0 will disable this feature; maximum allowed // space will be infinite (Default value). @@ -43,14 +42,14 @@ class SstFileManager { // other background functions may continue, such as logging and flushing. 
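To illustrate the SecondaryCache extension point declared above, here is a hedged, no-op skeleton. The method signatures follow the declarations in this header; the element type of the WaitAll() vector is assumed to be SecondaryCacheResultHandle* (the template argument is garbled in the diff text).

#include "rocksdb/secondary_cache.h"

class NoopSecondaryCache : public rocksdb::SecondaryCache {
 public:
  std::string Name() override { return "NoopSecondaryCache"; }

  // Admits nothing. A real implementation would use the SaveToCallback in
  // `helper` to serialize `value` into the secondary tier.
  rocksdb::Status Insert(const rocksdb::Slice& /*key*/, void* /*value*/,
                         const rocksdb::Cache::CacheItemHelper* /*helper*/) override {
    return rocksdb::Status::OK();
  }

  // Never hits. A real implementation would call `create_cb` on the stored
  // bytes and return a handle that becomes ready (immediately if wait=true).
  std::unique_ptr<rocksdb::SecondaryCacheResultHandle> Lookup(
      const rocksdb::Slice& /*key*/,
      const rocksdb::Cache::CreateCallback& /*create_cb*/,
      bool /*wait*/) override {
    return nullptr;
  }

  void Erase(const rocksdb::Slice& /*key*/) override {}

  void WaitAll(
      std::vector<rocksdb::SecondaryCacheResultHandle*> /*handles*/) override {}

  std::string GetPrintableOptions() const override { return ""; }
};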
virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0; - // Return true if the total size of SST files exceeded the maximum allowed - // space usage. + // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. // // thread-safe. virtual bool IsMaxAllowedSpaceReached() = 0; - // Returns true if the total size of SST files as well as estimated size - // of ongoing compactions exceeds the maximums allowed space usage. + // Returns true if the total size of SST and blob files as well as estimated + // size of ongoing compactions exceeds the maximums allowed space usage. virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0; // Return the total size of all tracked files. @@ -87,7 +86,7 @@ class SstFileManager { }; // Create a new SstFileManager that can be shared among multiple RocksDB -// instances to track SST file and control there deletion rate. +// instances to track SST and blob files and control there deletion rate. // Even though SstFileManager don't track WAL files but it still control // there deletion rate. // diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h index 88b1e42cec1..ec436c32edf 100644 --- a/include/rocksdb/sst_file_writer.h +++ b/include/rocksdb/sst_file_writer.h @@ -90,6 +90,9 @@ class SstFileWriter { // ascending order. // If unsafe_disable_sync is false, SstFileWriter will not sync new sst file // on close. + // The `skip_filters` option is DEPRECATED and could be removed in the + // future. Use `BlockBasedTableOptions::filter_policy` to control filter + // generation. SstFileWriter(const EnvOptions& env_options, const Options& options, ColumnFamilyHandle* column_family = nullptr, bool invalidate_page_cache = true, diff --git a/include/rocksdb/sst_partitioner.h b/include/rocksdb/sst_partitioner.h index 5d181958f55..1ac16b49e6e 100644 --- a/include/rocksdb/sst_partitioner.h +++ b/include/rocksdb/sst_partitioner.h @@ -51,12 +51,12 @@ class SstPartitioner { // It is called for all keys in compaction. When partitioner want to create // new SST file it needs to return true. It means compaction job will finish // current SST file where last key is "prev_user_key" parameter and start new - // SST file where first key is "current_user_key". Returns decission if + // SST file where first key is "current_user_key". Returns decision if // partition boundary was detected and compaction should create new file. virtual PartitionerResult ShouldPartition( const PartitionerRequest& request) = 0; - // Called with smallest and largest keys in SST file when compation try to do + // Called with smallest and largest keys in SST file when compaction try to do // trivial move. Returns true is partitioner allows to do trivial move. virtual bool CanDoTrivialMove(const Slice& smallest_user_key, const Slice& largest_user_key) = 0; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 98b4fb970de..8fc5a2eedb2 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -117,7 +117,7 @@ enum Tickers : uint32_t { COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. // Deletions obsoleted before bottom level due to file gap optimization. 
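Since the comments above now cover blob files as well as SSTs, a short, hedged sketch of wiring an SstFileManager into the DB options and capping total tracked file size; the limit value is an arbitrary example.

#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

rocksdb::Options MakeOptionsWithSpaceLimit() {
  rocksdb::Options options;
  std::shared_ptr<rocksdb::SstFileManager> sfm(
      rocksdb::NewSstFileManager(rocksdb::Env::Default()));
  // Fail writes once tracked SST + blob files exceed ~64 GiB.
  sfm->SetMaxAllowedSpaceUsage(64ull << 30);
  options.sst_file_manager = sfm;
  return options;
}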
COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, - // If a compaction was cancelled in sfm to prevent ENOSPC + // If a compaction was canceled in sfm to prevent ENOSPC COMPACTION_CANCELLED, // Number of keys written to the database via the Put and Write call's @@ -183,7 +183,7 @@ enum Tickers : uint32_t { // over large number of keys with same userkey. NUMBER_OF_RESEEKS_IN_ITERATION, - // Record the number of calls to GetUpadtesSince. Useful to keep track of + // Record the number of calls to GetUpdatesSince. Useful to keep track of // transaction log iterator refreshes GET_UPDATES_SINCE_CALLS, BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache @@ -374,6 +374,21 @@ enum Tickers : uint32_t { // # of files deleted immediately by sst file manger through delete scheduler. FILES_DELETED_IMMEDIATELY, + // The counters for error handler, not that, bg_io_error is the subset of + // bg_error and bg_retryable_io_error is the subset of bg_io_error + ERROR_HANDLER_BG_ERROR_COUNT, + ERROR_HANDLER_BG_IO_ERROR_COUNT, + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + ERROR_HANDLER_AUTORESUME_COUNT, + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + + // Statistics for memtable garbage collection: + // Raw bytes of data (payload) present on memtable at flush time. + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + // Outdated bytes of data present on memtable at flush time. + MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + TICKER_ENUM_MAX }; @@ -438,7 +453,7 @@ enum Histograms : uint32_t { BLOB_DB_VALUE_SIZE, // BlobDB Put/PutWithTTL/PutUntil/Write latency. BLOB_DB_WRITE_MICROS, - // BlobDB Get lagency. + // BlobDB Get latency. BLOB_DB_GET_MICROS, // BlobDB MultiGet latency. BLOB_DB_MULTIGET_MICROS, @@ -472,6 +487,9 @@ enum Histograms : uint32_t { // Num of sst files read from file system per level. NUM_SST_READ_PER_LEVEL, + // Error handler statistics + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + HISTOGRAM_ENUM_MAX, }; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index bcc55e4fd02..1de2ebcb046 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -65,9 +65,11 @@ class Status { // In case of intentionally swallowing an error, user must explicitly call // this function. That way we are easily able to search the code to find where // error swallowing occurs. 
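A small, hedged sketch of reading the new error-handler and memtable-garbage tickers added above; it assumes options.statistics was populated (e.g. via rocksdb::CreateDBStatistics()) before the DB was opened.

#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

uint64_t FlushedGarbageBytes(const rocksdb::Options& options) {
  if (!options.statistics) return 0;
  // Other new counters such as ERROR_HANDLER_BG_ERROR_COUNT and
  // MEMTABLE_PAYLOAD_BYTES_AT_FLUSH can be read the same way.
  return options.statistics->getTickerCount(
      rocksdb::MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
}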
- void PermitUncheckedError() const { + inline void PermitUncheckedError() const { MarkChecked(); } + + inline void MustCheck() const { #ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; + checked_ = false; #endif // ROCKSDB_ASSERT_STATUS_CHECKED } @@ -92,9 +94,7 @@ class Status { }; Code code() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code_; } @@ -118,9 +118,7 @@ class Status { }; SubCode subcode() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return subcode_; } @@ -134,18 +132,18 @@ class Status { }; Status(const Status& s, Severity sev); + + Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg) + : Status(_code, _subcode, msg, "", _sev) {} + Severity severity() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return sev_; } // Returns a C style string indicating the message of the Status const char* getState() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return state_; } @@ -289,127 +287,95 @@ class Status { // Returns true iff the status indicates success. bool ok() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kOk; } // Returns true iff the status indicates success *with* something // overwritten bool IsOkOverwritten() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kOk && subcode() == kOverwritten; } // Returns true iff the status indicates a NotFound error. bool IsNotFound() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kNotFound; } // Returns true iff the status indicates a Corruption error. bool IsCorruption() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kCorruption; } // Returns true iff the status indicates a NotSupported error. bool IsNotSupported() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kNotSupported; } // Returns true iff the status indicates an InvalidArgument error. bool IsInvalidArgument() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kInvalidArgument; } // Returns true iff the status indicates an IOError. bool IsIOError() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kIOError; } // Returns true iff the status indicates an MergeInProgress. 
bool IsMergeInProgress() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kMergeInProgress; } // Returns true iff the status indicates Incomplete bool IsIncomplete() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kIncomplete; } // Returns true iff the status indicates Shutdown In progress bool IsShutdownInProgress() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kShutdownInProgress; } bool IsTimedOut() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kTimedOut; } bool IsAborted() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kAborted; } bool IsLockLimit() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kAborted && subcode() == kLockLimit; } // Returns true iff the status indicates that a resource is Busy and // temporarily could not be acquired. bool IsBusy() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kBusy; } bool IsDeadlock() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kBusy && subcode() == kDeadlock; } // Returns true iff the status indicated that the operation has Expired. bool IsExpired() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kExpired; } @@ -417,25 +383,19 @@ class Status { // This usually means that the operation failed, but may succeed if // re-attempted. bool IsTryAgain() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kTryAgain; } // Returns true iff the status indicates the proposed compaction is too large bool IsCompactionTooLarge() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kCompactionTooLarge; } // Returns true iff the status indicates Column Family Dropped bool IsColumnFamilyDropped() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kColumnFamilyDropped; } @@ -445,9 +405,7 @@ class Status { // with a specific subcode, enabling users to take the appropriate action // if needed bool IsNoSpace() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kIOError) && (subcode() == kNoSpace); } @@ -455,9 +413,7 @@ class Status { // cases where we limit the memory used in certain operations (eg. the size // of a write batch) in order to avoid out of memory exceptions. bool IsMemoryLimit() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kAborted) && (subcode() == kMemoryLimit); } @@ -466,9 +422,7 @@ class Status { // directory" error condition. 
A PathNotFound error is an I/O error with // a specific subcode, enabling users to take appropriate action if necessary bool IsPathNotFound() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kIOError || code() == kNotFound) && (subcode() == kPathNotFound); } @@ -476,25 +430,19 @@ class Status { // Returns true iff the status indicates manual compaction paused. This // is caused by a call to PauseManualCompaction bool IsManualCompactionPaused() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); } // Returns true iff the status indicates a TxnNotPrepared error. bool IsTxnNotPrepared() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared); } // Returns true iff the status indicates a IOFenced error. bool IsIOFenced() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kIOError) && (subcode() == kIOFenced); } @@ -519,33 +467,34 @@ class Status { explicit Status(Code _code, SubCode _subcode = kNone) : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {} - Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); + Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2, + Severity sev = kNoError); Status(Code _code, const Slice& msg, const Slice& msg2) : Status(_code, kNone, msg, msg2) {} static const char* CopyState(const char* s); + + inline void MarkChecked() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } }; inline Status::Status(const Status& s) : code_(s.code_), subcode_(s.subcode_), sev_(s.sev_) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } inline Status::Status(const Status& s, Severity sev) : code_(s.code_), subcode_(s.subcode_), sev_(sev) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); state_ = (s.state_ == nullptr) ? 
nullptr : CopyState(s.state_); } inline Status& Status::operator=(const Status& s) { if (this != &s) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; - checked_ = false; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); + MustCheck(); code_ = s.code_; subcode_ = s.subcode_; sev_ = s.sev_; @@ -560,9 +509,7 @@ inline Status::Status(Status&& s) noexcept #endif : Status() { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); *this = std::move(s); } @@ -572,10 +519,8 @@ inline Status& Status::operator=(Status&& s) #endif { if (this != &s) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; - checked_ = false; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); + MustCheck(); code_ = std::move(s.code_); s.code_ = kOk; subcode_ = std::move(s.subcode_); @@ -590,18 +535,14 @@ inline Status& Status::operator=(Status&& s) } inline bool Status::operator==(const Status& rhs) const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; - rhs.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); + rhs.MarkChecked(); return (code_ == rhs.code_); } inline bool Status::operator!=(const Status& rhs) const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; - rhs.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); + rhs.MarkChecked(); return !(*this == rhs); } diff --git a/include/rocksdb/system_clock.h b/include/rocksdb/system_clock.h new file mode 100644 index 00000000000..e03d195ee24 --- /dev/null +++ b/include/rocksdb/system_clock.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include + +#include + +#ifdef _WIN32 +// Windows API macro interference +#undef GetCurrentTime +#endif + +namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; + +// A SystemClock is an interface used by the rocksdb implementation to access +// operating system time-related functionality. +class SystemClock { + public: + virtual ~SystemClock() {} + + static const char* Type() { return "SystemClock"; } + + // The name of this system clock + virtual const char* Name() const = 0; + + // Return a default SystemClock suitable for the current operating + // system. + static const std::shared_ptr& Default(); + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // Returns the number of micro-seconds of CPU time used by the current thread. + // 0 indicates not supported. 
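With ROCKSDB_ASSERT_STATUS_CHECKED builds in mind, a hedged sketch of how the checked-status plumbing above is intended to be used by callers:

#include "rocksdb/db.h"
#include "rocksdb/status.h"

void WriteAndFlush(rocksdb::DB* db) {
  rocksdb::Status s = db->Put(rocksdb::WriteOptions(), "key", "value");
  if (!s.ok()) {
    // Calling ok()/code()/IsIOError() etc. marks the Status as checked, so
    // ROCKSDB_ASSERT_STATUS_CHECKED builds will not assert on destruction.
  }
  // When a result is intentionally ignored, say so explicitly instead of
  // silently dropping the Status.
  db->Flush(rocksdb::FlushOptions()).PermitUncheckedError();
}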
+ virtual uint64_t CPUMicros() { return 0; } + + // Returns the number of nano-seconds of CPU time used by the current thread. + // Default implementation simply relies on CPUMicros. + // 0 indicates not supported. + virtual uint64_t CPUNanos() { return CPUMicros() * 1000; } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; +}; + +// Wrapper class for a SystemClock. Redirects all methods (except Name) +// of the SystemClock interface to the target/wrapped class. +class SystemClockWrapper : public SystemClock { + public: + explicit SystemClockWrapper(const std::shared_ptr& t) + : target_(t) {} + + uint64_t NowMicros() override { return target_->NowMicros(); } + + uint64_t NowNanos() override { return target_->NowNanos(); } + + uint64_t CPUMicros() override { return target_->CPUMicros(); } + + uint64_t CPUNanos() override { return target_->CPUNanos(); } + + virtual void SleepForMicroseconds(int micros) override { + return target_->SleepForMicroseconds(micros); + } + + Status GetCurrentTime(int64_t* unix_time) override { + return target_->GetCurrentTime(unix_time); + } + + std::string TimeToString(uint64_t time) override { + return target_->TimeToString(time); + } + + protected: + std::shared_ptr target_; +}; + +} // end namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 8e0d144f886..13b31ee47b9 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -22,7 +22,7 @@ #include #include -#include "rocksdb/configurable.h" +#include "rocksdb/customizable.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/status.h" @@ -51,6 +51,55 @@ enum ChecksumType : char { kxxHash64 = 0x3, }; +// `PinningTier` is used to specify which tier of block-based tables should +// be affected by a block cache pinning setting (see +// `MetadataCacheOptions` below). +enum class PinningTier { + // For compatibility, this value specifies to fallback to the behavior + // indicated by the deprecated options, + // `pin_l0_filter_and_index_blocks_in_cache` and + // `pin_top_level_index_and_filter`. + kFallback, + + // This tier contains no block-based tables. + kNone, + + // This tier contains block-based tables that may have originated from a + // memtable flush. In particular, it includes tables from L0 that are smaller + // than 1.5 times the current `write_buffer_size`. Note these criteria imply + // it can include intra-L0 compaction outputs and ingested files, as long as + // they are not abnormally large compared to flushed files in L0. + kFlushedAndSimilar, + + // This tier contains all block-based tables. + kAll, +}; + +// `MetadataCacheOptions` contains members indicating the desired caching +// behavior for the different categories of metadata blocks. +struct MetadataCacheOptions { + // The tier of block-based tables whose top-level index into metadata + // partitions will be pinned. Currently indexes and filters may be + // partitioned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise any top-level index into metadata partitions would be + // held in table reader memory, outside the block cache. 
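As an illustration of the new SystemClock/SystemClockWrapper interfaces above, a hedged sketch of a custom wrapper; the class name and offset behavior are invented for the example, and only NowMicros() is shifted here.

#include <memory>
#include "rocksdb/system_clock.h"

// SystemClockWrapper forwards every method except Name(), so a subclass only
// needs to supply a name plus whatever it wants to override.
class OffsetClock : public rocksdb::SystemClockWrapper {
 public:
  OffsetClock(const std::shared_ptr<rocksdb::SystemClock>& base,
              uint64_t offset_micros)
      : SystemClockWrapper(base), offset_micros_(offset_micros) {}

  const char* Name() const override { return "OffsetClock"; }

  uint64_t NowMicros() override {
    return SystemClockWrapper::NowMicros() + offset_micros_;
  }

 private:
  uint64_t offset_micros_;
};

// Usage (illustrative): auto clock = std::make_shared<OffsetClock>(
//     rocksdb::SystemClock::Default(), /*offset_micros=*/0);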
+ PinningTier top_level_index_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose metadata partitions will be pinned. + // Currently indexes and filters may be partitioned. + PinningTier partition_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose unpartitioned metadata blocks will be + // pinned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise the unpartitioned meta-blocks would be held in table + // reader memory, outside the block cache. + PinningTier unpartitioned_pinning = PinningTier::kFallback; +}; + // For advanced user only struct BlockBasedTableOptions { static const char* kName() { return "BlockTableOptions"; }; @@ -79,12 +128,44 @@ struct BlockBasedTableOptions { // than data blocks. bool cache_index_and_filter_blocks_with_high_priority = true; + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating each of the following variables that + // has the default value, `PinningTier::kFallback`: + // + // - `MetadataCacheOptions::partition_pinning` + // - `MetadataCacheOptions::unpartitioned_pinning` + // + // The updated value is chosen as follows: + // + // - `pin_l0_filter_and_index_blocks_in_cache == false` -> + // `PinningTier::kNone` + // - `pin_l0_filter_and_index_blocks_in_cache == true` -> + // `PinningTier::kFlushedAndSimilar` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // // if cache_index_and_filter_blocks is true and the below is true, then // filter and index blocks are stored in the cache, but a reference is // held in the "table reader" object so the blocks are pinned and only // evicted from cache when the table reader is freed. bool pin_l0_filter_and_index_blocks_in_cache = false; + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating + // `MetadataCacheOptions::top_level_index_pinning` when it has the + // default value, `PinningTier::kFallback`. + // + // The updated value is chosen as follows: + // + // - `pin_top_level_index_and_filter == false` -> + // `PinningTier::kNone` + // - `pin_top_level_index_and_filter == true` -> + // `PinningTier::kAll` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // // If cache_index_and_filter_blocks is true and the below is true, then // the top-level index of partitioned filter and index blocks are stored in // the cache, but a reference is held in the "table reader" object so the @@ -92,6 +173,12 @@ struct BlockBasedTableOptions { // freed. This is not limited to l0 in LSM tree. bool pin_top_level_index_and_filter = true; + // The desired block cache pinning behavior for the different categories of + // metadata blocks. While pinning can reduce block cache contention, users + // must take care not to pin excessive amounts of data, which risks + // overflowing block cache. + MetadataCacheOptions metadata_cache_options; + // The index type that will be used for this table. enum IndexType : char { // A space efficient index block that is optimized for @@ -201,13 +288,13 @@ struct BlockBasedTableOptions { // incompatible with block-based filters. bool partition_filters = false; - // EXPERIMENTAL Option to generate Bloom filters that minimize memory + // Option to generate Bloom/Ribbon filters that minimize memory // internal fragmentation. 
// // When false, malloc_usable_size is not available, or format_version < 5, // filters are generated without regard to internal fragmentation when // loaded into memory (historical behavior). When true (and - // malloc_usable_size is available and format_version >= 5), then Bloom + // malloc_usable_size is available and format_version >= 5), then // filters are generated to "round up" and "round down" their sizes to // minimize internal fragmentation when loaded into memory, assuming the // reading DB has the same memory allocation characteristics as the @@ -226,7 +313,8 @@ struct BlockBasedTableOptions { // NOTE: Because some memory counted by block cache might be unmapped pages // within internal fragmentation, this option can increase observed RSS // memory usage. With cache_index_and_filter_blocks=true, this option makes - // the block cache better at using space it is allowed. + // the block cache better at using space it is allowed. (These issues + // should not arise with partitioned filters.) // // NOTE: Do not set to true if you do not trust malloc_usable_size. With // this option, RocksDB might access an allocated memory object beyond its @@ -304,7 +392,7 @@ struct BlockBasedTableOptions { // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned // filters use a generally faster and more accurate Bloom filter // implementation, with a different schema. - uint32_t format_version = 4; + uint32_t format_version = 5; // Store index blocks on disk in compressed format. Changing this option to // false will avoid the overhead of decompression if index blocks are evicted @@ -348,6 +436,55 @@ struct BlockBasedTableOptions { IndexShorteningMode index_shortening = IndexShorteningMode::kShortenSeparators; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size. The readahead + // starts at 8KB and doubles on every additional read upto + // max_auto_readahead_size and max_auto_readahead_size can be configured. + // + // Special Value: 0 - If max_auto_readahead_size is set 0 then no implicit + // auto prefetching will be done. If max_auto_readahead_size provided is less + // than 8KB (which is initial readahead size used by rocksdb in case of + // auto-readahead), readahead size will remain same as + // max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 256 * 1024 as it will prefetch + // the blocks. + // + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{max_auto_readahead_size=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. + // + // Default: 256 KB (256 * 1024). + size_t max_auto_readahead_size = 256 * 1024; + + // If enabled, prepopulate warm/hot data blocks which are already in memory + // into block cache at the time of flush. On a flush, the data block that is + // in memory (in memtables) get flushed to the device. If using Direct IO, + // additional IO is incurred to read this data back into memory again, which + // is avoided by enabling this option. This further helps if the workload + // exhibits high temporal locality, where most of the reads go to recently + // written data. This also helps in case of Distributed FileSystem. 
+ // + // Right now, this is enabled only for flush for data blocks. We plan to + // expand this option to cover compactions in the future and for other types + // of blocks. + enum class PrepopulateBlockCache : char { + // Disable prepopulate block cache. + kDisable, + // Prepopulate data blocks during flush only. Plan to extend it to all block + // types. + kFlushOnly, + }; + + PrepopulateBlockCache prepopulate_block_cache = + PrepopulateBlockCache::kDisable; }; // Table Properties that are specific to block-based table properties. @@ -441,7 +578,7 @@ struct PlainTableOptions { // @store_index_in_file: compute plain table index and bloom filter during // file building and store it in file. When reading - // file, index will be mmaped instead of recomputation. + // file, index will be mapped instead of recomputation. bool store_index_in_file = false; }; @@ -526,7 +663,7 @@ extern TableFactory* NewCuckooTableFactory( class RandomAccessFileReader; // A base class for table factories. -class TableFactory : public Configurable { +class TableFactory : public Customizable { public: virtual ~TableFactory() override {} @@ -540,21 +677,7 @@ class TableFactory : public Configurable { const std::string& id, std::shared_ptr* factory); - // The type of the table. - // - // The client of this package should switch to a new name whenever - // the table format implementation changes. - // - // Names starting with "rocksdb." are reserved and should not be used - // by any clients of this package. - virtual const char* Name() const = 0; - - // Returns true if the class is an instance of the input name. - // This is typically determined by if the input name matches the - // name of this object. - virtual bool IsInstanceOf(const std::string& name) const { - return name == Name(); - } + static const char* Type() { return "TableFactory"; } // Returns a Table object table that can fetch data from file specified // in parameter file. It's the caller's responsibility to make sure @@ -611,7 +734,7 @@ class TableFactory : public Configurable { // to use in this table. 
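Pulling the new block-based table knobs above together, a hedged configuration sketch; the chosen values are arbitrary examples, not recommendations.

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeTableOptions() {
  rocksdb::BlockBasedTableOptions bbto;
  // Pinning tiers only take effect with cache_index_and_filter_blocks=true.
  bbto.cache_index_and_filter_blocks = true;
  bbto.metadata_cache_options.top_level_index_pinning =
      rocksdb::PinningTier::kAll;
  bbto.metadata_cache_options.partition_pinning =
      rocksdb::PinningTier::kFlushedAndSimilar;
  bbto.metadata_cache_options.unpartitioned_pinning =
      rocksdb::PinningTier::kFlushedAndSimilar;
  // Cap implicit iterator readahead at 256 KB (the default); 0 disables it.
  bbto.max_auto_readahead_size = 256 * 1024;
  // Warm the block cache with data blocks at flush time.
  bbto.prepopulate_block_cache =
      rocksdb::BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;

  rocksdb::Options options;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
  return options;
}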
virtual TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const = 0; + WritableFileWriter* file) const = 0; // Return is delete range supported virtual bool IsDeleteRangeSupported() const { return false; } diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index ba3eca752ef..d3e9eeace3e 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -32,6 +32,7 @@ typedef std::map UserCollectedProperties; struct TablePropertiesNames { static const std::string kDbId; static const std::string kDbSessionId; + static const std::string kDbHostId; static const std::string kDataSize; static const std::string kIndexSize; static const std::string kIndexPartitions; @@ -43,6 +44,7 @@ struct TablePropertiesNames { static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kNumFilterEntries; static const std::string kDeletedKeys; static const std::string kMergeOperands; static const std::string kNumRangeDeletions; @@ -60,6 +62,8 @@ struct TablePropertiesNames { static const std::string kCreationTime; static const std::string kOldestKeyTime; static const std::string kFileCreationTime; + static const std::string kSlowCompressionEstimatedDataSize; + static const std::string kFastCompressionEstimatedDataSize; }; extern const std::string kPropertiesBlock; @@ -98,9 +102,9 @@ class TablePropertiesCollector { } // Called after each new block is cut - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) { // Nothing to do here. Callback registers can override. return; } @@ -172,6 +176,8 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // the number of unique entries (keys or prefixes) added to filters + uint64_t num_filter_entries = 0; // the number of deletions in the table uint64_t num_deletions = 0; // the number of merge operands in the table @@ -194,6 +200,14 @@ struct TableProperties { uint64_t oldest_key_time = 0; // Actual SST file creation time. 0 means unknown. uint64_t file_creation_time = 0; + // Estimated size of data blocks if compressed using a relatively slower + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t slow_compression_estimated_data_size = 0; + // Estimated size of data blocks if compressed using a relatively faster + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t fast_compression_estimated_data_size = 0; // DB identity // db_id is an identifier generated the first time the DB is created @@ -206,6 +220,12 @@ struct TableProperties { // empty string. std::string db_session_id; + // Location of the machine hosting the DB instance + // db_host_id identifies the location of the host in some form + // (hostname by default, but can also be any string of the user's choosing). + // It can potentially change whenever the DB is opened + std::string db_host_id; + // Name of the column family with which this SST file is associated. // If column family is unknown, `column_family_name` will be an empty string. 
std::string column_family_name; @@ -251,6 +271,11 @@ struct TableProperties { // Aggregate the numerical member variables of the specified // TableProperties. void Add(const TableProperties& tp); + + // Subset of properties that make sense when added together + // between tables. Keys match field names in this class instead + // of using full property names. + std::map GetAggregatablePropertiesAsMap() const; }; // Extra properties diff --git a/include/rocksdb/trace_reader_writer.h b/include/rocksdb/trace_reader_writer.h index d58ed47b2a0..26ceab2c847 100644 --- a/include/rocksdb/trace_reader_writer.h +++ b/include/rocksdb/trace_reader_writer.h @@ -28,7 +28,7 @@ class TraceWriter { }; // TraceReader allows reading RocksDB traces from any system, one operation at -// a time. A RocksDB Replayer could depend on this to replay opertions. +// a time. A RocksDB Replayer could depend on this to replay operations. class TraceReader { public: TraceReader() {} diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index a4ab9c07a0b..f495fcd22c1 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -12,11 +12,36 @@ namespace ROCKSDB_NAMESPACE { // Define all public custom types here. +using ColumnFamilyId = uint32_t; + // Represents a sequence number in a WAL file. typedef uint64_t SequenceNumber; const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, + kMisc, +}; + +// The types of files RocksDB uses in a DB directory. (Available for +// advanced options.) +enum FileType { + kWalFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile, + kOptionsFile, + kBlobFile +}; + // User-oriented representation of internal key types. // Ordering of this enum entries should not change. enum EntryType { diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h index e3aeee6ceec..f4df5c0009c 100644 --- a/include/rocksdb/universal_compaction.h +++ b/include/rocksdb/universal_compaction.h @@ -36,12 +36,12 @@ class CompactionOptionsUniversal { // The size amplification is defined as the amount (in percentage) of // additional storage needed to store a single byte of data in the database. // For example, a size amplification of 2% means that a database that - // contains 100 bytes of user-data may occupy upto 102 bytes of + // contains 100 bytes of user-data may occupy up to 102 bytes of // physical storage. By this definition, a fully compacted database has // a size amplification of 0%. Rocksdb uses the following heuristic // to calculate size amplification: it assumes that all files excluding // the earliest file contribute to the size amplification. - // Default: 200, which means that a 100 byte database could require upto + // Default: 200, which means that a 100 byte database could require up to // 300 bytes of storage. unsigned int max_size_amplification_percent; diff --git a/include/rocksdb/utilities/backup_engine.h b/include/rocksdb/utilities/backup_engine.h new file mode 100644 index 00000000000..d6a7764e635 --- /dev/null +++ b/include/rocksdb/utilities/backup_engine.h @@ -0,0 +1,606 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// The default DB file checksum function name. +constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; +// The default BackupEngine file checksum function name. +constexpr char kBackupFileChecksumFuncName[] = "crc32c"; + +struct BackupEngineOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // share_table_files supports table and blob files. + // + // If share_table_files == true, the backup directory will share table and + // blob files among backups, to save space among backups of the same DB and to + // enable incremental backups by only copying new files. + // If share_table_files == false, each backup will be on its own and will not + // share any data with other backups. + // + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup even + // on a machine crash/reboot. Backup process is slower with sync enabled. + // If sync == false, we don't guarantee anything on machine reboot. However, + // chances are some of the backups are consistent. + // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // Default: 0 + uint64_t backup_rate_limit; + + // Backup rate limiter. Used to control transfer speed for backup. If this is + // not null, backup_rate_limit is ignored. + // Default: nullptr + std::shared_ptr backup_rate_limiter{nullptr}; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // Default: 0 + uint64_t restore_rate_limit; + + // Restore rate limiter. Used to control transfer speed during restore. If + // this is not null, restore_rate_limit is ignored. + // Default: nullptr + std::shared_ptr restore_rate_limiter{nullptr}; + + // share_files_with_checksum supports table and blob files. + // + // Only used if share_table_files is set to true. Setting to false is + // DEPRECATED and potentially dangerous because in that case BackupEngine + // can lose data if backing up databases with distinct or divergent + // history, for example if restoring from a backup other than the latest, + // writing to the DB, and creating another backup. 
Setting to true (default) + // prevents these issues by ensuring that different table files (SSTs) and + // blob files with the same number are treated as distinct. See + // share_files_with_checksum_naming and ShareFilesNaming. + // + // Default: true + bool share_files_with_checksum; + + // Up to this many background threads will copy files for CreateNewBackup() + // and RestoreDBFromBackup() + // Default: 1 + int max_background_operations; + + // During backup user can get callback every time next + // callback_trigger_interval_size bytes being copied. + // Default: 4194304 + uint64_t callback_trigger_interval_size; + + // For BackupEngineReadOnly, Open() will open at most this many of the + // latest non-corrupted backups. + // + // Note: this setting is ignored (behaves like INT_MAX) for any kind of + // writable BackupEngine because it would inhibit accounting for shared + // files for proper backup deletion, including purging any incompletely + // created backups on creation of a new backup. + // + // Default: INT_MAX + int max_valid_backups_to_open; + + // ShareFilesNaming describes possible naming schemes for backup + // table and blob file names when they are stored in the + // shared_checksum directory (i.e., both share_table_files and + // share_files_with_checksum are true). + enum ShareFilesNaming : uint32_t { + // Backup blob filenames are __.blob and + // backup SST filenames are __.sst + // where is an unsigned decimal integer. This is the + // original/legacy naming scheme for share_files_with_checksum, + // with two problems: + // * At massive scale, collisions on this triple with different file + // contents is plausible. + // * Determining the name to use requires computing the checksum, + // so generally requires reading the whole file even if the file + // is already backed up. + // + // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** + kLegacyCrc32cAndFileSize = 1U, + + // Backup SST filenames are _s.sst. This + // pair of values should be very strongly unique for a given SST file + // and easily determined before computing a checksum. The 's' indicates + // the value is a DB session id, not a checksum. + // + // Exceptions: + // * For blob files, kLegacyCrc32cAndFileSize is used as currently + // db_session_id is not supported by the blob file format. + // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize + // will be used instead, matching the names assigned by RocksDB versions + // not supporting the newer naming scheme. + // * See also flags below. + kUseDbSessionId = 2U, + + kMaskNoNamingFlags = 0xffffU, + + // If not already part of the naming scheme, insert + // _ + // before .sst and .blob in the name. In case of user code actually parsing + // the last _ before the .sst and .blob as the file size, this + // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this + // option makes official that unofficial feature of the backup metadata. + // + // We do not consider SST and blob file sizes to have sufficient entropy to + // contribute significantly to naming uniqueness. + kFlagIncludeFileSize = 1U << 31, + + kMaskNamingFlags = ~kMaskNoNamingFlags, + }; + + // Naming option for share_files_with_checksum table and blob files. See + // ShareFilesNaming for details. 
+ // + // Modifying this option cannot introduce a downgrade compatibility issue + // because RocksDB can read, restore, and delete backups using different file + // names, and it's OK for a backup directory to use a mixture of table and + // blob files naming schemes. + // + // However, modifying this option and saving more backups to the same + // directory can lead to the same file getting saved again to that + // directory, under the new shared name in addition to the old shared + // name. + // + // Default: kUseDbSessionId | kFlagIncludeFileSize + // + // Note: This option comes into effect only if both share_files_with_checksum + // and share_table_files are true. + ShareFilesNaming share_files_with_checksum_naming; + + void Dump(Logger* logger) const; + + explicit BackupEngineOptions( + const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, + bool _sync = true, bool _destroy_old_data = false, + bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, + uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, + uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, + int _max_valid_backups_to_open = INT_MAX, + ShareFilesNaming _share_files_with_checksum_naming = + static_cast(kUseDbSessionId | kFlagIncludeFileSize)) + : backup_dir(_backup_dir), + backup_env(_backup_env), + share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data), + backup_log_files(_backup_log_files), + backup_rate_limit(_backup_rate_limit), + restore_rate_limit(_restore_rate_limit), + share_files_with_checksum(true), + max_background_operations(_max_background_operations), + callback_trigger_interval_size(_callback_trigger_interval_size), + max_valid_backups_to_open(_max_valid_backups_to_open), + share_files_with_checksum_naming(_share_files_with_checksum_naming) { + assert(share_table_files || !share_files_with_checksum); + assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0); + } +}; + +inline BackupEngineOptions::ShareFilesNaming operator&( + BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert(r == BackupEngineOptions::kMaskNoNamingFlags || + (r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l & r); +} + +inline BackupEngineOptions::ShareFilesNaming operator|( + BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l | r); +} + +struct CreateBackupOptions { + // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable. + bool flush_before_backup = false; + + // Callback for reporting progress, based on callback_trigger_interval_size. + std::function progress_callback = []() {}; + + // If false, background_thread_cpu_priority is ignored. + // Otherwise, the cpu priority can be decreased, + // if you try to increase the priority, the priority will not change. + // The initial priority of the threads is CpuPriority::kNormal, + // so you can decrease to priorities lower than kNormal. 
+ bool decrease_background_thread_cpu_priority = false; + CpuPriority background_thread_cpu_priority = CpuPriority::kNormal; +}; + +struct RestoreOptions { + // If true, restore won't overwrite the existing log files in wal_dir. It will + // also move all log files from archive directory to wal_dir. Use this option + // in combination with BackupEngineOptions::backup_log_files = false for + // persisting in-memory databases. + // Default: false + bool keep_log_files; + + explicit RestoreOptions(bool _keep_log_files = false) + : keep_log_files(_keep_log_files) {} +}; + +struct BackupFileInfo { + // File name and path relative to the backup_dir directory. + std::string relative_filename; + + // Size of the file in bytes, not including filesystem overheads. + uint64_t size; +}; + +typedef uint32_t BackupID; + +struct BackupInfo { + BackupID backup_id = 0U; + // Creation time, according to GetCurrentTime + int64_t timestamp = 0; + + // Total size in bytes (based on file payloads, not including filesystem + // overheads or backup meta file) + uint64_t size = 0U; + + // Number of backed up files, some of which might be shared with other + // backups. Does not include backup meta file. + uint32_t number_files = 0U; + + // Backup API user metadata + std::string app_metadata; + + // Backup file details, if requested with include_file_details=true + std::vector file_details; + + // DB "name" (a directory in the backup_env) for opening this backup as a + // read-only DB. This should also be used as the DBOptions::wal_dir, such + // as by default setting wal_dir="". See also env_for_open. + // This field is only set if include_file_details=true + std::string name_for_open; + + // An Env(+FileSystem) for opening this backup as a read-only DB, with + // DB::OpenForReadOnly or similar. This field is only set if + // include_file_details=true. (The FileSystem in this Env takes care + // of making shared backup files openable from the `name_for_open` DB + // directory.) See also name_for_open. + // + // This Env might or might not be shared with other backups. To work + // around DBOptions::env being a raw pointer, this is a shared_ptr so + // that keeping either this BackupInfo, the BackupEngine, or a copy of + // this shared_ptr alive is sufficient to keep the Env alive for use by + // a read-only DB. + std::shared_ptr env_for_open; + + BackupInfo() {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files, const std::string& _app_metadata) + : backup_id(_backup_id), + timestamp(_timestamp), + size(_size), + number_files(_number_files), + app_metadata(_app_metadata) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; +}; + +// Read-only functions of a BackupEngine. (Restore writes to another directory +// not the backup directory.) See BackupEngine comments for details on +// safe concurrent operations. 
+class BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnlyBase() {} + + // Returns info about the latest good backup in backup_info, or NotFound + // no good backup exists. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual Status GetLatestBackupInfo( + BackupInfo* backup_info, bool include_file_details = false) const = 0; + + // Returns info about a specific backup in backup_info, or NotFound + // or Corruption status if the requested backup id does not exist or is + // known corrupt. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const = 0; + + // Returns info about backups in backup_info + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual void GetBackupInfo(std::vector* backup_info, + bool include_file_details = false) const = 0; + + // Returns info about corrupt backups in corrupt_backups. + // WARNING: Any write to the BackupEngine could trigger automatic + // GarbageCollect(), which could delete files that would be needed to + // manually recover a corrupt backup or to preserve an unrecognized (e.g. + // incompatible future version) backup. + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) const = 0; + + // Restore to specified db_dir and wal_dir from backup_id. + virtual Status RestoreDBFromBackup(const RestoreOptions& options, + BackupID backup_id, + const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); + } + + // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id + virtual Status RestoreDBFromLatestBackup( + const RestoreOptions& options, const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromLatestBackup(options, db_dir, wal_dir); + } + + // If verify_with_checksum is true, this function + // inspects the current checksums and file sizes of backup files to see if + // they match our expectation. + // + // If verify_with_checksum is false, this function + // checks that each file exists and that the size of the file matches our + // expectation. It does not check file checksum. + // + // If this BackupEngine created the backup, it compares the files' current + // sizes (and current checksum) against the number of bytes written to + // them (and the checksum calculated) during creation. + // Otherwise, it compares the files' current sizes (and checksums) against + // their sizes (and checksums) when the BackupEngine was opened. + // + // Returns Status::OK() if all checks are good + virtual Status VerifyBackup(BackupID backup_id, + bool verify_with_checksum = false) const = 0; +}; + +// Append-only functions of a BackupEngine. 
See BackupEngine comment for +// details on distinction between Append and Write operations and safe +// concurrent operations. +class BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngineAppendOnlyBase() {} + + // same as CreateNewBackup, but stores extra application metadata. + virtual Status CreateNewBackupWithMetadata( + const CreateBackupOptions& options, DB* db, + const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0; + + // keep here for backward compatibility. + virtual Status CreateNewBackupWithMetadata( + DB* db, const std::string& app_metadata, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackupWithMetadata(options, db, app_metadata); + } + + // Captures the state of the database by creating a new (latest) backup. + // On success (OK status), the BackupID of the new backup is saved to + // *new_backup_id when not nullptr. + virtual Status CreateNewBackup(const CreateBackupOptions& options, DB* db, + BackupID* new_backup_id = nullptr) { + return CreateNewBackupWithMetadata(options, db, "", new_backup_id); + } + + // keep here for backward compatibility. + virtual Status CreateNewBackup( + DB* db, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackup(options, db); + } + + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediately, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up the + // next time you call CreateNewBackup or GarbageCollect. + virtual void StopBackup() = 0; + + // Will delete any files left over from incomplete creation or deletion of + // a backup. This is not normally needed as those operations also clean up + // after prior incomplete calls to the same kind of operation (create or + // delete). This does not delete corrupt backups but can delete files that + // would be needed to manually recover a corrupt backup or to preserve an + // unrecognized (e.g. incompatible future version) backup. + // NOTE: This is not designed to delete arbitrary files added to the backup + // directory outside of BackupEngine, and clean-up is always subject to + // permissions on and availability of the underlying filesystem. + // NOTE2: For concurrency and interference purposes (see BackupEngine + // comment), GarbageCollect (GC) is like other Append operations, even + // though it seems different. Although GC can delete physical data, it does + // not delete any logical data read by Read operations. GC can interfere + // with Append or Write operations in another BackupEngine on the same + // backup_dir, because temporary files will be treated as obsolete and + // deleted. + virtual Status GarbageCollect() = 0; +}; + +// A backup engine for organizing and managing backups. +// This class is not user-extensible. +// +// This class declaration adds "Write" operations in addition to the +// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase. 
+// +// # Concurrency between threads on the same BackupEngine* object +// +// As of version 6.20, BackupEngine* operations are generally thread-safe, +// using a read-write lock, though single-thread operation is still +// recommended to avoid TOCTOU bugs. Specifically, particular kinds of +// concurrent operations behave like this: +// +// op1\op2| Read | Append | Write +// -------|-------|--------|-------- +// Read | conc | block | block +// Append | block | block | block +// Write | block | block | block +// +// conc = operations safely proceed concurrently +// block = one of the operations safely blocks until the other completes. +// There is generally no guarantee as to which completes first. +// +// StopBackup is the only operation that affects an ongoing operation. +// +// # Interleaving operations between BackupEngine* objects open on the +// same backup_dir +// +// It is recommended only to have one BackupEngine* object open for a given +// backup_dir, but it is possible to mix / interleave some operations +// (regardless of whether they are concurrent) with these caveats: +// +// op1\op2| Open | Read | Append | Write +// -------|--------|--------|--------|-------- +// Open | conc | conc | atomic | unspec +// Read | conc | conc | old | unspec +// Append | atomic | old | unspec | unspec +// Write | unspec | unspec | unspec | unspec +// +// Special case: Open with destroy_old_data=true is really a Write +// +// conc = operations safely proceed, concurrently when applicable +// atomic = operations are effectively atomic; if a concurrent Append +// operation has not completed at some key point during Open, the +// opened BackupEngine* will never see the result of the Append op. +// old = Read operations do not include any state changes from other +// BackupEngine* objects; they return the state at their Open time. +// unspec = Behavior is unspecified, including possibly trashing the +// backup_dir, but is "memory safe" (no C++ undefined behavior) +// +class BackupEngine : public BackupEngineReadOnlyBase, + public BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngine() {} + + // BackupEngineOptions have to be the same as the ones used in previous + // BackupEngines for the same backup directory. + static Status Open(const BackupEngineOptions& options, Env* db_env, + BackupEngine** backup_engine_ptr); + + // keep for backward compatibility. + static Status Open(Env* db_env, const BackupEngineOptions& options, + BackupEngine** backup_engine_ptr) { + return BackupEngine::Open(options, db_env, backup_engine_ptr); + } + + // Deletes old backups, keeping latest num_backups_to_keep alive. + // See also DeleteBackup. + virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + + // Deletes a specific backup. If this operation (or PurgeOldBackups) + // is not completed due to crash, power failure, etc. the state + // will be cleaned up the next time you call DeleteBackup, + // PurgeOldBackups, or GarbageCollect. + virtual Status DeleteBackup(BackupID backup_id) = 0; +}; + +// A variant of BackupEngine that only allows "Read" operations. See +// BackupEngine comment for details. This class is not user-extensible. +class BackupEngineReadOnly : public BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnly() {} + + static Status Open(const BackupEngineOptions& options, Env* db_env, + BackupEngineReadOnly** backup_engine_ptr); + // keep for backward compatibility. 
+ static Status Open(Env* db_env, const BackupEngineOptions& options, + BackupEngineReadOnly** backup_engine_ptr) { + return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr); + } +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 36f29edbb2f..de040b5521c 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -1,535 +1,26 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This is a DEPRECATED header for API backward compatibility. Please +// use backup_engine.h. #pragma once #ifndef ROCKSDB_LITE +// A legacy unnecessary include #include -#include -#include -#include -#include -#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/backup_engine.h" -#include "rocksdb/env.h" -#include "rocksdb/options.h" -#include "rocksdb/status.h" +// A legacy unnecessary include +#include "rocksdb/utilities/stackable_db.h" namespace ROCKSDB_NAMESPACE { -// The default DB file checksum function name. -constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; -// The default BackupEngine file checksum function name. -constexpr char kBackupFileChecksumFuncName[] = "crc32c"; - -struct BackupableDBOptions { - // Where to keep the backup files. Has to be different than dbname_ - // Best to set this to dbname_ + "/backups" - // Required - std::string backup_dir; - - // Backup Env object. It will be used for backup file I/O. If it's - // nullptr, backups will be written out using DBs Env. If it's - // non-nullptr, backup's I/O will be performed using this object. - // If you want to have backups on HDFS, use HDFS Env here! - // Default: nullptr - Env* backup_env; - - // If share_table_files == true, backup will assume that table files with - // same name have the same contents. This enables incremental backups and - // avoids unnecessary data copies. - // If share_table_files == false, each backup will be on its own and will - // not share any data with other backups. - // default: true - bool share_table_files; - - // Backup info and error messages will be written to info_log - // if non-nullptr. - // Default: nullptr - Logger* info_log; - - // If sync == true, we can guarantee you'll get consistent backup even - // on a machine crash/reboot. Backup process is slower with sync enabled. - // If sync == false, we don't guarantee anything on machine reboot. However, - // chances are some of the backups are consistent. - // Default: true - bool sync; - - // If true, it will delete whatever backups there are already - // Default: false - bool destroy_old_data; - - // If false, we won't backup log files. This option can be useful for backing - // up in-memory databases where log file are persisted, but table files are in - // memory. - // Default: true - bool backup_log_files; - - // Max bytes that can be transferred in a second during backup. 
- // If 0, go as fast as you can - // Default: 0 - uint64_t backup_rate_limit; - - // Backup rate limiter. Used to control transfer speed for backup. If this is - // not null, backup_rate_limit is ignored. - // Default: nullptr - std::shared_ptr backup_rate_limiter{nullptr}; - - // Max bytes that can be transferred in a second during restore. - // If 0, go as fast as you can - // Default: 0 - uint64_t restore_rate_limit; - - // Restore rate limiter. Used to control transfer speed during restore. If - // this is not null, restore_rate_limit is ignored. - // Default: nullptr - std::shared_ptr restore_rate_limiter{nullptr}; - - // Only used if share_table_files is set to true. If true, will consider - // that backups can come from different databases, even differently mutated - // databases with the same DB ID. See share_files_with_checksum_naming and - // ShareFilesNaming for details on how table files names are made - // unique between databases. - // - // Using 'true' is fundamentally safer, and performance improvements vs. - // original design should leave almost no reason to use the 'false' setting. - // - // Default (only for historical reasons): false - bool share_files_with_checksum; - - // Up to this many background threads will copy files for CreateNewBackup() - // and RestoreDBFromBackup() - // Default: 1 - int max_background_operations; - - // During backup user can get callback every time next - // callback_trigger_interval_size bytes being copied. - // Default: 4194304 - uint64_t callback_trigger_interval_size; - - // For BackupEngineReadOnly, Open() will open at most this many of the - // latest non-corrupted backups. - // - // Note: this setting is ignored (behaves like INT_MAX) for any kind of - // writable BackupEngine because it would inhibit accounting for shared - // files for proper backup deletion, including purging any incompletely - // created backups on creation of a new backup. - // - // Default: INT_MAX - int max_valid_backups_to_open; - - // ShareFilesNaming describes possible naming schemes for backup - // table file names when the table files are stored in the shared_checksum - // directory (i.e., both share_table_files and share_files_with_checksum - // are true). - enum ShareFilesNaming : uint32_t { - // Backup SST filenames are __.sst - // where is an unsigned decimal integer. This is the - // original/legacy naming scheme for share_files_with_checksum, - // with two problems: - // * At massive scale, collisions on this triple with different file - // contents is plausible. - // * Determining the name to use requires computing the checksum, - // so generally requires reading the whole file even if the file - // is already backed up. - // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** - kLegacyCrc32cAndFileSize = 1U, - - // Backup SST filenames are _s.sst. This - // pair of values should be very strongly unique for a given SST file - // and easily determined before computing a checksum. The 's' indicates - // the value is a DB session id, not a checksum. - // - // Exceptions: - // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize - // will be used instead, matching the names assigned by RocksDB versions - // not supporting the newer naming scheme. - // * See also flags below. - kUseDbSessionId = 2U, - - kMaskNoNamingFlags = 0xffffU, - - // If not already part of the naming scheme, insert - // _ - // before .sst in the name. 
In case of user code actually parsing the - // last _ before the .sst as the file size, this preserves that - // feature of kLegacyCrc32cAndFileSize. In other words, this option makes - // official that unofficial feature of the backup metadata. - // - // We do not consider SST file sizes to have sufficient entropy to - // contribute significantly to naming uniqueness. - kFlagIncludeFileSize = 1U << 31, - - // When encountering an SST file from a Facebook-internal early - // release of 6.12, use the default naming scheme in effect for - // when the SST file was generated (assuming full file checksum - // was not set to GetFileChecksumGenCrc32cFactory()). That naming is - // _.sst - // and ignores kFlagIncludeFileSize setting. - // NOTE: This flag is intended to be temporary and should be removed - // in a later release. - kFlagMatchInterimNaming = 1U << 30, - - kMaskNamingFlags = ~kMaskNoNamingFlags, - }; - - // Naming option for share_files_with_checksum table files. See - // ShareFilesNaming for details. - // - // Modifying this option cannot introduce a downgrade compatibility issue - // because RocksDB can read, restore, and delete backups using different file - // names, and it's OK for a backup directory to use a mixture of table file - // naming schemes. - // - // However, modifying this option and saving more backups to the same - // directory can lead to the same file getting saved again to that - // directory, under the new shared name in addition to the old shared - // name. - // - // Default: kUseDbSessionId | kFlagIncludeFileSize | kFlagMatchInterimNaming - // - // Note: This option comes into effect only if both share_files_with_checksum - // and share_table_files are true. - ShareFilesNaming share_files_with_checksum_naming; - - void Dump(Logger* logger) const; - - explicit BackupableDBOptions( - const std::string& _backup_dir, Env* _backup_env = nullptr, - bool _share_table_files = true, Logger* _info_log = nullptr, - bool _sync = true, bool _destroy_old_data = false, - bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, - uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, - uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, - int _max_valid_backups_to_open = INT_MAX, - ShareFilesNaming _share_files_with_checksum_naming = - static_cast(kUseDbSessionId | kFlagIncludeFileSize | - kFlagMatchInterimNaming)) - : backup_dir(_backup_dir), - backup_env(_backup_env), - share_table_files(_share_table_files), - info_log(_info_log), - sync(_sync), - destroy_old_data(_destroy_old_data), - backup_log_files(_backup_log_files), - backup_rate_limit(_backup_rate_limit), - restore_rate_limit(_restore_rate_limit), - share_files_with_checksum(false), - max_background_operations(_max_background_operations), - callback_trigger_interval_size(_callback_trigger_interval_size), - max_valid_backups_to_open(_max_valid_backups_to_open), - share_files_with_checksum_naming(_share_files_with_checksum_naming) { - assert(share_table_files || !share_files_with_checksum); - assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0); - } -}; - -inline BackupableDBOptions::ShareFilesNaming operator&( - BackupableDBOptions::ShareFilesNaming lhs, - BackupableDBOptions::ShareFilesNaming rhs) { - uint32_t l = static_cast(lhs); - uint32_t r = static_cast(rhs); - assert(r == BackupableDBOptions::kMaskNoNamingFlags || - (r & BackupableDBOptions::kMaskNoNamingFlags) == 0); - return static_cast(l & r); -} - -inline BackupableDBOptions::ShareFilesNaming operator|( - 
BackupableDBOptions::ShareFilesNaming lhs, - BackupableDBOptions::ShareFilesNaming rhs) { - uint32_t l = static_cast(lhs); - uint32_t r = static_cast(rhs); - assert((r & BackupableDBOptions::kMaskNoNamingFlags) == 0); - return static_cast(l | r); -} - -struct CreateBackupOptions { - // Flush will always trigger if 2PC is enabled. - // If write-ahead logs are disabled, set flush_before_backup=true to - // avoid losing unflushed key/value pairs from the memtable. - bool flush_before_backup = false; - - // Callback for reporting progress, based on callback_trigger_interval_size. - std::function progress_callback = []() {}; - - // If false, background_thread_cpu_priority is ignored. - // Otherwise, the cpu priority can be decreased, - // if you try to increase the priority, the priority will not change. - // The initial priority of the threads is CpuPriority::kNormal, - // so you can decrease to priorities lower than kNormal. - bool decrease_background_thread_cpu_priority = false; - CpuPriority background_thread_cpu_priority = CpuPriority::kNormal; -}; - -struct RestoreOptions { - // If true, restore won't overwrite the existing log files in wal_dir. It will - // also move all log files from archive directory to wal_dir. Use this option - // in combination with BackupableDBOptions::backup_log_files = false for - // persisting in-memory databases. - // Default: false - bool keep_log_files; - - explicit RestoreOptions(bool _keep_log_files = false) - : keep_log_files(_keep_log_files) {} -}; - -typedef uint32_t BackupID; - -struct BackupInfo { - BackupID backup_id; - int64_t timestamp; - uint64_t size; - - uint32_t number_files; - std::string app_metadata; - - BackupInfo() {} - - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, - uint32_t _number_files, const std::string& _app_metadata) - : backup_id(_backup_id), - timestamp(_timestamp), - size(_size), - number_files(_number_files), - app_metadata(_app_metadata) {} -}; - -class BackupStatistics { - public: - BackupStatistics() { - number_success_backup = 0; - number_fail_backup = 0; - } - - BackupStatistics(uint32_t _number_success_backup, - uint32_t _number_fail_backup) - : number_success_backup(_number_success_backup), - number_fail_backup(_number_fail_backup) {} - - ~BackupStatistics() {} - - void IncrementNumberSuccessBackup(); - void IncrementNumberFailBackup(); - - uint32_t GetNumberSuccessBackup() const; - uint32_t GetNumberFailBackup() const; - - std::string ToString() const; - - private: - uint32_t number_success_backup; - uint32_t number_fail_backup; -}; - -// A backup engine for accessing information about backups and restoring from -// them. -// BackupEngineReadOnly is not extensible. -class BackupEngineReadOnly { - public: - virtual ~BackupEngineReadOnly() {} - - static Status Open(const BackupableDBOptions& options, Env* db_env, - BackupEngineReadOnly** backup_engine_ptr); - // keep for backward compatibility. 
- static Status Open(Env* db_env, const BackupableDBOptions& options, - BackupEngineReadOnly** backup_engine_ptr) { - return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr); - } - - // Returns info about backups in backup_info - // You can GetBackupInfo safely, even with other BackupEngine performing - // backups on the same directory - virtual void GetBackupInfo(std::vector* backup_info) = 0; - - // Returns info about corrupt backups in corrupt_backups - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - - // Restoring DB from backup is NOT safe when there is another BackupEngine - // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's - // responsibility to synchronize the operation, i.e. don't delete the backup - // when you're restoring from it - // See also the corresponding doc in BackupEngine - virtual Status RestoreDBFromBackup(const RestoreOptions& options, - BackupID backup_id, - const std::string& db_dir, - const std::string& wal_dir) = 0; - - // keep for backward compatibility. - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& options = RestoreOptions()) { - return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); - } - - // See the corresponding doc in BackupEngine - virtual Status RestoreDBFromLatestBackup(const RestoreOptions& options, - const std::string& db_dir, - const std::string& wal_dir) = 0; - - // keep for backward compatibility. - virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& options = RestoreOptions()) { - return RestoreDBFromLatestBackup(options, db_dir, wal_dir); - } - - // If verify_with_checksum is true, this function - // inspects the current checksums and file sizes of backup files to see if - // they match our expectation. - // - // If verify_with_checksum is false, this function - // checks that each file exists and that the size of the file matches our - // expectation. It does not check file checksum. - // - // If this BackupEngine created the backup, it compares the files' current - // sizes (and current checksum) against the number of bytes written to - // them (and the checksum calculated) during creation. - // Otherwise, it compares the files' current sizes (and checksums) against - // their sizes (and checksums) when the BackupEngine was opened. - // - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id, - bool verify_with_checksum = false) = 0; -}; - -// A backup engine for creating new backups. -// BackupEngine is not extensible. -class BackupEngine { - public: - virtual ~BackupEngine() {} - - // BackupableDBOptions have to be the same as the ones used in previous - // BackupEngines for the same backup directory. - static Status Open(const BackupableDBOptions& options, Env* db_env, - BackupEngine** backup_engine_ptr); - - // keep for backward compatibility. - static Status Open(Env* db_env, const BackupableDBOptions& options, - BackupEngine** backup_engine_ptr) { - return BackupEngine::Open(options, db_env, backup_engine_ptr); - } - - // same as CreateNewBackup, but stores extra application metadata. - virtual Status CreateNewBackupWithMetadata( - const CreateBackupOptions& options, DB* db, - const std::string& app_metadata) = 0; - - // keep here for backward compatibility. 
- virtual Status CreateNewBackupWithMetadata( - DB* db, const std::string& app_metadata, bool flush_before_backup = false, - std::function progress_callback = []() {}) { - CreateBackupOptions options; - options.flush_before_backup = flush_before_backup; - options.progress_callback = progress_callback; - return CreateNewBackupWithMetadata(options, db, app_metadata); - } - - // Captures the state of the database in the latest backup - // NOT a thread safe call - virtual Status CreateNewBackup(const CreateBackupOptions& options, DB* db) { - return CreateNewBackupWithMetadata(options, db, ""); - } - - // keep here for backward compatibility. - virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false, - std::function progress_callback = - []() {}) { - CreateBackupOptions options; - options.flush_before_backup = flush_before_backup; - options.progress_callback = progress_callback; - return CreateNewBackup(options, db); - } - - // Deletes old backups, keeping latest num_backups_to_keep alive. - // See also DeleteBackup. - virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; - - // Deletes a specific backup. If this operation (or PurgeOldBackups) - // is not completed due to crash, power failure, etc. the state - // will be cleaned up the next time you call DeleteBackup, - // PurgeOldBackups, or GarbageCollect. - virtual Status DeleteBackup(BackupID backup_id) = 0; - - // Call this from another thread if you want to stop the backup - // that is currently happening. It will return immediatelly, will - // not wait for the backup to stop. - // The backup will stop ASAP and the call to CreateNewBackup will - // return Status::Incomplete(). It will not clean up after itself, but - // the state will remain consistent. The state will be cleaned up the - // next time you call CreateNewBackup or GarbageCollect. - virtual void StopBackup() = 0; - - // Returns info about backups in backup_info - virtual void GetBackupInfo(std::vector* backup_info) = 0; - - // Returns info about corrupt backups in corrupt_backups - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - - // restore from backup with backup_id - // IMPORTANT -- if options_.share_table_files == true, - // options_.share_files_with_checksum == false, you restore DB from some - // backup that is not the latest, and you start creating new backups from the - // new DB, they will probably fail. - // - // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. - // If you add new data to the DB and try creating a new backup now, the - // database will diverge from backups 4 and 5 and the new backup will fail. - // If you want to create new backup, you will first have to delete backups 4 - // and 5. - virtual Status RestoreDBFromBackup(const RestoreOptions& options, - BackupID backup_id, - const std::string& db_dir, - const std::string& wal_dir) = 0; - - // keep for backward compatibility. - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& options = RestoreOptions()) { - return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); - } - - // restore from the latest backup - virtual Status RestoreDBFromLatestBackup(const RestoreOptions& options, - const std::string& db_dir, - const std::string& wal_dir) = 0; - - // keep for backward compatibility. 
- virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& options = RestoreOptions()) { - return RestoreDBFromLatestBackup(options, db_dir, wal_dir); - } - - // If verify_with_checksum is true, this function - // inspects the current checksums and file sizes of backup files to see if - // they match our expectation. - // - // If verify_with_checksum is false, this function - // checks that each file exists and that the size of the file matches our - // expectation. It does not check file checksum. - // - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id, - bool verify_with_checksum = false) = 0; - - // Will delete any files left over from incomplete creation or deletion of - // a backup. This is not normally needed as those operations also clean up - // after prior incomplete calls to the same kind of operation (create or - // delete). - // NOTE: This is not designed to delete arbitrary files added to the backup - // directory outside of BackupEngine, and clean-up is always subject to - // permissions on and availability of the underlying filesystem. - virtual Status GarbageCollect() = 0; -}; +using BackupableDBOptions = BackupEngineOptions; } // namespace ROCKSDB_NAMESPACE + #endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h index 1b6a7407ff1..df2a744033e 100644 --- a/include/rocksdb/utilities/checkpoint.h +++ b/include/rocksdb/utilities/checkpoint.h @@ -24,13 +24,13 @@ class Checkpoint { // Creates a Checkpoint object to be used for creating openable snapshots static Status Create(DB* db, Checkpoint** checkpoint_ptr); - // Builds an openable snapshot of RocksDB on the same disk, which - // accepts an output directory on the same disk, and under the directory - // (1) hard-linked SST files pointing to existing live SST files - // SST files will be copied if output directory is on a different filesystem - // (2) a copied manifest files and other files - // The directory should not already exist and will be created by this API. - // The directory will be an absolute path + // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an + // absolute path. The specified directory should not exist, since it will be + // created by the API. + // When a checkpoint is created, + // (1) SST and blob files are hard linked if the output directory is on the + // same filesystem as the database, and copied otherwise. + // (2) other required files (like MANIFEST) are always copied. // log_size_for_flush: if the total log file size is equal or larger than // this value, then a flush is triggered for all the column families. The // default value is 0, which means flush is always triggered. 
If you move diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index c7f227fc02a..e900abefee5 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -32,6 +32,7 @@ class LDBCommand { public: // Command-line arguments static const std::string ARG_ENV_URI; + static const std::string ARG_FS_URI; static const std::string ARG_DB; static const std::string ARG_PATH; static const std::string ARG_SECONDARY_PATH; @@ -88,6 +89,8 @@ class LDBCommand { virtual void OverrideBaseOptions(); + virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts); + virtual void SetDBOptions(Options options) { options_ = options; } virtual void SetColumnFamilies( @@ -135,6 +138,7 @@ class LDBCommand { protected: LDBCommandExecuteResult exec_state_; std::string env_uri_; + std::string fs_uri_; std::string db_path_; // If empty, open DB as primary. If non-empty, open the DB as secondary // with this secondary path. When running against a database opened by diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h index 538cb6a8fd4..5a454d7755f 100644 --- a/include/rocksdb/utilities/object_registry.h +++ b/include/rocksdb/utilities/object_registry.h @@ -17,12 +17,23 @@ namespace ROCKSDB_NAMESPACE { class Logger; +class ObjectLibrary; + // Returns a new T when called with a string. Populates the std::unique_ptr // argument if granting ownership to caller. template using FactoryFunc = std::function*, std::string*)>; +// The signature of the function for loading factories +// into an object library. This method is expected to register +// factory functions in the supplied ObjectLibrary. +// The ObjectLibrary is the library in which the factories will be loaded. +// The std::string is the argument passed to the loader function. +// The RegistrarFunc should return the number of objects loaded into this +// library +using RegistrarFunc = std::function; + class ObjectLibrary { public: // Base class for an Entry in the Registry. @@ -62,9 +73,18 @@ class ObjectLibrary { FactoryFunc factory_; }; // End class FactoryEntry public: + explicit ObjectLibrary(const std::string& id) { id_ = id; } + + const std::string& GetID() const { return id_; } // Finds the entry matching the input name and type const Entry* FindEntry(const std::string& type, const std::string& name) const; + + // Returns the total number of factories registered for this library. + // This method returns the sum of all factories registered for all types. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(size_t* num_types) const; + void Dump(Logger* logger) const; // Registers the factory with the library for the pattern. @@ -76,6 +96,12 @@ class ObjectLibrary { AddEntry(T::Type(), entry); return factory; } + + // Invokes the registrar function with the supplied arg for this library. 
+ int Register(const RegistrarFunc& registrar, const std::string& arg) { + return registrar(*this, arg); + } + // Returns the default ObjectLibrary static std::shared_ptr& Default(); @@ -85,6 +111,9 @@ class ObjectLibrary { // ** FactoryFunctions for this loader, organized by type std::unordered_map>> entries_; + + // The name for this library + std::string id_; }; // The ObjectRegistry is used to register objects that can be created by a @@ -93,11 +122,26 @@ class ObjectLibrary { class ObjectRegistry { public: static std::shared_ptr NewInstance(); - - ObjectRegistry(); + static std::shared_ptr NewInstance( + const std::shared_ptr& parent); + static std::shared_ptr Default(); + explicit ObjectRegistry(const std::shared_ptr& parent) + : parent_(parent) {} + + std::shared_ptr AddLibrary(const std::string& id) { + auto library = std::make_shared(id); + libraries_.push_back(library); + return library; + } void AddLibrary(const std::shared_ptr& library) { - libraries_.emplace_back(library); + libraries_.push_back(library); + } + + void AddLibrary(const std::string& id, const RegistrarFunc& registrar, + const std::string& arg) { + auto library = AddLibrary(id); + library->Register(registrar, arg); } // Creates a new T using the factory function that was registered with a @@ -193,6 +237,10 @@ class ObjectRegistry { void Dump(Logger* logger) const; private: + explicit ObjectRegistry(const std::shared_ptr& library) { + libraries_.push_back(library); + } + const ObjectLibrary::Entry* FindEntry(const std::string& type, const std::string& name) const; @@ -200,6 +248,7 @@ class ObjectRegistry { // The libraries are searched in reverse order (back to front) when // searching for entries. std::vector> libraries_; + std::shared_ptr parent_; }; } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 5356df71f39..c070e49a309 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -51,6 +51,8 @@ struct OptimisticTransactionDBOptions { uint32_t occ_lock_buckets = (1 << 20); }; +// Range deletions (including those in `WriteBatch`es passed to `Write()`) are +// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status` class OptimisticTransactionDB : public StackableDB { public: // Open an OptimisticTransactionDB similar to DB::Open(). 
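A hedged sketch (not part of the patch) of the new registration flow in object_registry.h: a RegistrarFunc that loads factories into an ObjectLibrary, wired up through the AddLibrary(id) / Register(registrar, arg) / GetFactoryCount() additions shown above. MyCustomEnv is invented for illustration, and the template ObjectLibrary::Register<T>(pattern, factory) call inside the registrar is an assumption (only that method's body appears in the hunk above).

#include <memory>
#include <string>
#include "rocksdb/env.h"
#include "rocksdb/utilities/object_registry.h"

using namespace ROCKSDB_NAMESPACE;

class MyCustomEnv : public EnvWrapper {
 public:
  MyCustomEnv() : EnvWrapper(Env::Default()) {}
};

// A RegistrarFunc: registers factories in the supplied library and returns
// the number of factories it added.
static int RegisterMyObjects(ObjectLibrary& library,
                             const std::string& /*arg*/) {
  library.Register<Env>(
      "MyCustomEnv",
      [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
         std::string* /*errmsg*/) {
        guard->reset(new MyCustomEnv());
        return guard->get();
      });
  return 1;
}

void RegisterExample() {
  std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();

  // Create a named library in the registry and invoke the registrar on it.
  auto library = registry->AddLibrary("my_plugins");
  int loaded = library->Register(RegisterMyObjects, /*arg=*/"");

  // GetFactoryCount() reports what was loaded: here 1 factory for 1 type.
  size_t num_types = 0;
  size_t num_factories = library->GetFactoryCount(&num_types);
  (void)loaded;
  (void)num_factories;
}

The three-argument AddLibrary(id, registrar, arg) overload above collapses the two steps into one call when the library handle itself is not needed afterwards.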
diff --git a/include/rocksdb/utilities/options_type.h b/include/rocksdb/utilities/options_type.h index 2bd081abfb0..7057c78ac20 100644 --- a/include/rocksdb/utilities/options_type.h +++ b/include/rocksdb/utilities/options_type.h @@ -25,6 +25,7 @@ enum class OptionType { kInt32T, kInt64T, kUInt, + kUInt8T, kUInt32T, kUInt64T, kSizeT, @@ -49,6 +50,8 @@ enum class OptionType { kStruct, kVector, kConfigurable, + kCustomizable, + kEncodedString, kUnknown, }; @@ -93,13 +96,14 @@ enum class OptionTypeFlags : uint32_t { kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible, kCompareExact = ConfigOptions::kSanityLevelExactMatch, - kMutable = 0x0100, // Option is mutable - kRawPointer = 0x0200, // The option is stored as a raw pointer - kShared = 0x0400, // The option is stored as a shared_ptr - kUnique = 0x0800, // The option is stored as a unique_ptr - kAllowNull = 0x1000, // The option can be null - kDontSerialize = 0x2000, // Don't serialize the option - kDontPrepare = 0x4000, // Don't prepare or sanitize this option + kMutable = 0x0100, // Option is mutable + kRawPointer = 0x0200, // The option is stored as a raw pointer + kShared = 0x0400, // The option is stored as a shared_ptr + kUnique = 0x0800, // The option is stored as a unique_ptr + kAllowNull = 0x1000, // The option can be null + kDontSerialize = 0x2000, // Don't serialize the option + kDontPrepare = 0x4000, // Don't prepare or sanitize this option + kStringNameOnly = 0x8000, // The option serializes to a name only }; inline OptionTypeFlags operator|(const OptionTypeFlags &a, @@ -174,7 +178,7 @@ bool VectorsAreEqual(const ConfigOptions& config_options, // @param addr Pointer to the object using ParseFunc = std::function; + const std::string& /*value*/, void* /*addr*/)>; // Function for converting an option "addr" into its string representation. // On success, Status::OK is returned and value is the serialized form. @@ -185,7 +189,7 @@ using ParseFunc = std::function; + const void* /*addr*/, std::string* /*value*/)>; // Function for comparing two option values // If they are not equal, updates "mismatch" with the name of the bad option @@ -197,7 +201,7 @@ using SerializeFunc = std::function; + const void* /*addr1*/, const void* /*addr2*/, std::string* mismatch)>; // A struct for storing constant option information such as option name, // option type, and offset. @@ -253,7 +257,7 @@ class OptionTypeInfo { // - Create a static map of string values to the corresponding enum value // - Call this method passing the static map in as a parameter. // Note that it is not necessary to add a new OptionType or make any - // other changes -- the returned object handles parsing, serialiation, and + // other changes -- the returned object handles parsing, serialization, and // comparisons. // // @param offset The offset in the option object for this enum @@ -270,10 +274,10 @@ class OptionTypeInfo { // @return OK if the value is found in the map // @return InvalidArgument if the value is not found in the map [map](const ConfigOptions&, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { if (map == nullptr) { return Status::NotSupported("No enum mapping ", name); - } else if (ParseEnum(*map, value, reinterpret_cast(addr))) { + } else if (ParseEnum(*map, value, static_cast(addr))) { return Status::OK(); } else { return Status::InvalidArgument("No mapping for enum ", name); @@ -284,11 +288,11 @@ class OptionTypeInfo { // value is updated to the corresponding string value in the map. 
// @return OK if the enum is found in the map // @return InvalidArgument if the enum is not found in the map - [map](const ConfigOptions&, const std::string& name, const char* addr, + [map](const ConfigOptions&, const std::string& name, const void* addr, std::string* value) { if (map == nullptr) { return Status::NotSupported("No enum mapping ", name); - } else if (SerializeEnum(*map, (*reinterpret_cast(addr)), + } else if (SerializeEnum(*map, (*static_cast(addr)), value)) { return Status::OK(); } else { @@ -297,10 +301,10 @@ class OptionTypeInfo { }, // Casts addr1 and addr2 to the enum type and returns true if // they are equal, false otherwise. - [](const ConfigOptions&, const std::string&, const char* addr1, - const char* addr2, std::string*) { - return (*reinterpret_cast(addr1) == - *reinterpret_cast(addr2)); + [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { + return (*static_cast(addr1) == + *static_cast(addr2)); }); } // End OptionTypeInfo::Enum @@ -335,20 +339,20 @@ class OptionTypeInfo { // Parses the struct and updates the fields at addr [struct_name, struct_map](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { return ParseStruct(opts, struct_name, struct_map, name, value, addr); }, // Serializes the struct options into value [struct_name, struct_map](const ConfigOptions& opts, - const std::string& name, const char* addr, + const std::string& name, const void* addr, std::string* value) { return SerializeStruct(opts, struct_name, struct_map, name, addr, value); }, // Compares the struct fields of addr1 and addr2 for equality [struct_name, struct_map](const ConfigOptions& opts, - const std::string& name, const char* addr1, - const char* addr2, std::string* mismatch) { + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { return StructsAreEqual(opts, struct_name, struct_map, name, addr1, addr2, mismatch); }); @@ -361,14 +365,14 @@ class OptionTypeInfo { return OptionTypeInfo( offset, OptionType::kStruct, verification, flags, parse_func, [struct_name, struct_map](const ConfigOptions& opts, - const std::string& name, const char* addr, + const std::string& name, const void* addr, std::string* value) { return SerializeStruct(opts, struct_name, struct_map, name, addr, value); }, [struct_name, struct_map](const ConfigOptions& opts, - const std::string& name, const char* addr1, - const char* addr2, std::string* mismatch) { + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { return StructsAreEqual(opts, struct_name, struct_map, name, addr1, addr2, mismatch); }); @@ -384,30 +388,134 @@ class OptionTypeInfo { _offset, OptionType::kVector, _verification, _flags, [elem_info, separator](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { - auto result = reinterpret_cast*>(addr); + const std::string& value, void* addr) { + auto result = static_cast*>(addr); return ParseVector(opts, elem_info, separator, name, value, result); }, [elem_info, separator](const ConfigOptions& opts, - const std::string& name, const char* addr, + const std::string& name, const void* addr, std::string* value) { - const auto& vec = *(reinterpret_cast*>(addr)); + const auto& vec = *(static_cast*>(addr)); return SerializeVector(opts, elem_info, separator, name, vec, value); }, [elem_info](const ConfigOptions& opts, const std::string& name, - const char* addr1, const 
char* addr2, + const void* addr1, const void* addr2, std::string* mismatch) { - const auto& vec1 = *(reinterpret_cast*>(addr1)); - const auto& vec2 = *(reinterpret_cast*>(addr2)); + const auto& vec1 = *(static_cast*>(addr1)); + const auto& vec2 = *(static_cast*>(addr2)); return VectorsAreEqual(opts, elem_info, name, vec1, vec2, mismatch); }); } + // Create a new std::shared_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::shared_ptr object. + // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomSharedPtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* shared = static_cast*>(addr); + return T::CreateFromString(opts, value, shared); + }, + serialize_func, equals_func); + } + + // Create a new std::unique_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::unique_ptr object. + // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomUniquePtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kUnique, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* unique = static_cast*>(addr); + return T::CreateFromString(opts, value, unique); + }, + serialize_func, equals_func); + } + + // Create a new Customizable* OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // T object. 
+ // + // @param _offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomRawPtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kRawPointer, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto** pointer = static_cast(addr); + return T::CreateFromString(opts, value, pointer); + }, + serialize_func, equals_func); + } + bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; } + bool IsEditable(const ConfigOptions& opts) const { + if (opts.mutable_options_only) { + return IsMutable(); + } else { + return true; + } + } bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); } bool IsDeprecated() const { @@ -475,7 +583,12 @@ class OptionTypeInfo { bool IsStruct() const { return (type_ == OptionType::kStruct); } - bool IsConfigurable() const { return (type_ == OptionType::kConfigurable); } + bool IsConfigurable() const { + return (type_ == OptionType::kConfigurable || + type_ == OptionType::kCustomizable); + } + + bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); } // Returns the underlying pointer for the type at base_addr // The value returned is the underlying "raw" pointer, offset from base. 
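To make the new helpers concrete, here is a hypothetical sketch (not part of the patch) of a type map combining OptionTypeInfo::Enum with the AsCustomSharedPtr helper added above. MyMode, MyPluginOptions and MyCache are invented names; the only thing AsCustomSharedPtr requires of MyCache is a static CreateFromString() with the signature used below.

#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include "rocksdb/convenience.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/options_type.h"

using namespace ROCKSDB_NAMESPACE;

enum class MyMode { kFast, kSafe };

static std::unordered_map<std::string, MyMode> mode_string_map = {
    {"fast", MyMode::kFast}, {"safe", MyMode::kSafe}};

// Stand-in for a Customizable-style class: AsCustomSharedPtr only calls its
// static CreateFromString() to turn the option string into an object.
class MyCache {
 public:
  virtual ~MyCache() {}
  static Status CreateFromString(const ConfigOptions& /*opts*/,
                                 const std::string& /*value*/,
                                 std::shared_ptr<MyCache>* result) {
    result->reset(new MyCache());
    return Status::OK();
  }
};

struct MyPluginOptions {
  MyMode mode = MyMode::kFast;
  std::shared_ptr<MyCache> cache;
};

static std::unordered_map<std::string, OptionTypeInfo> my_plugin_type_map = {
    // "mode=fast" parses through the string-to-enum map; serialization and
    // comparison functions are generated by OptionTypeInfo::Enum.
    {"mode", OptionTypeInfo::Enum<MyMode>(offsetof(MyPluginOptions, mode),
                                          &mode_string_map)},
    // "cache=..." is forwarded to MyCache::CreateFromString and the result is
    // stored in the shared_ptr member at the given offset.
    {"cache", OptionTypeInfo::AsCustomSharedPtr<MyCache>(
                  offsetof(MyPluginOptions, cache),
                  OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
};

A map like this can then be handed to the ParseType/SerializeType/TypesAreEqual helpers declared just below.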
@@ -484,20 +597,20 @@ class OptionTypeInfo { if (base_addr == nullptr) { return nullptr; } - const auto opt_addr = reinterpret_cast(base_addr) + offset_; + const void* opt_addr = static_cast(base_addr) + offset_; if (IsUniquePtr()) { const std::unique_ptr* ptr = - reinterpret_cast*>(opt_addr); + static_cast*>(opt_addr); return ptr->get(); } else if (IsSharedPtr()) { const std::shared_ptr* ptr = - reinterpret_cast*>(opt_addr); + static_cast*>(opt_addr); return ptr->get(); } else if (IsRawPtr()) { - const T* const* ptr = reinterpret_cast(opt_addr); + const T* const* ptr = static_cast(opt_addr); return *ptr; } else { - return reinterpret_cast(opt_addr); + return static_cast(opt_addr); } } @@ -508,18 +621,18 @@ class OptionTypeInfo { if (base_addr == nullptr) { return nullptr; } - auto opt_addr = reinterpret_cast(base_addr) + offset_; + void* opt_addr = static_cast(base_addr) + offset_; if (IsUniquePtr()) { - std::unique_ptr* ptr = reinterpret_cast*>(opt_addr); + std::unique_ptr* ptr = static_cast*>(opt_addr); return ptr->get(); } else if (IsSharedPtr()) { - std::shared_ptr* ptr = reinterpret_cast*>(opt_addr); + std::shared_ptr* ptr = static_cast*>(opt_addr); return ptr->get(); } else if (IsRawPtr()) { - T** ptr = reinterpret_cast(opt_addr); + T** ptr = static_cast(opt_addr); return *ptr; } else { - return reinterpret_cast(opt_addr); + return static_cast(opt_addr); } } @@ -553,6 +666,37 @@ class OptionTypeInfo { const std::string& opt_name, const void* const this_ptr, const std::string& that_value) const; + // Parses the input opts_map according to the type_map for the opt_addr + // For each name-value pair in opts_map, find the corresponding name in + // type_map If the name is found: + // - set the corresponding value in opt_addr, returning the status on + // failure; + // If the name is not found: + // - If unused is specified, add the name-value to unused and continue + // - If ingore_unknown_options is false, return NotFound + // Returns OK if all options were either: + // - Successfully set + // - options were not found and ignore_unknown_options=true + // - options were not found and unused was specified + // Note that this method is much less sophisticated than the comparable + // Configurable::Configure methods. For example, on error, there is no + // attempt to return opt_addr to the initial state. Additionally, there + // is no effort to initialize (Configurable::PrepareOptions) the object + // on success. This method should typically only be used for simpler, + // standalone structures and not those that contain shared and embedded + // objects. + static Status ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + static Status ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + // Parses the input value according to the map for the struct at opt_addr // struct_name is the name of the struct option as registered // opt_name is the name of the option being evaluated. 
This may @@ -561,7 +705,15 @@ class OptionTypeInfo { static Status ParseStruct( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* map, - const std::string& opt_name, const std::string& value, char* opt_addr); + const std::string& opt_name, const std::string& value, void* opt_addr); + + // Serializes the values from opt_addr using the rules in type_map. + // Returns the serialized form in result. + // Returns OK on success or non-OK if some option could not be serialized. + static Status SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* value); // Serializes the input addr according to the map for the struct to value. // struct_name is the name of the struct option as registered @@ -570,7 +722,16 @@ class OptionTypeInfo { static Status SerializeStruct( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* map, - const std::string& opt_name, const char* opt_addr, std::string* value); + const std::string& opt_name, const void* opt_addr, std::string* value); + + // Compares the values in this_addr and that_addr using the rules in type_map. + // If the values are equal, returns true + // If the values are not equal, returns false and sets mismatch to the name + // of the first value that did not match. + static bool TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& map, + const void* this_addr, const void* that_addr, std::string* mismatch); // Compares the input offsets according to the map for the struct and returns // true if they are equivalent, false otherwise. @@ -580,8 +741,8 @@ class OptionTypeInfo { static bool StructsAreEqual( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* map, - const std::string& opt_name, const char* this_offset, - const char* that_offset, std::string* mismatch); + const std::string& opt_name, const void* this_offset, + const void* that_offset, std::string* mismatch); // Finds the entry for the opt_name in the opt_map, returning // nullptr if not found. @@ -607,7 +768,7 @@ class OptionTypeInfo { // @param opts The string in which to find the next token // @param delimiter The delimiter between tokens // @param start The position in opts to start looking for the token - // @parem ed Returns the end position in opts of the token + // @param ed Returns the end position in opts of the token // @param token Returns the token // @returns OK if a token was found // @return InvalidArgument if the braces mismatch @@ -660,6 +821,10 @@ Status ParseVector(const ConfigOptions& config_options, result->clear(); Status status; + // Turn off ignore_unknown_objects so we can tell if the returned + // object is valid or not. 
+ ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; for (size_t start = 0, end = 0; status.ok() && start < value.size() && end != std::string::npos; start = end + 1) { @@ -667,10 +832,14 @@ Status ParseVector(const ConfigOptions& config_options, status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); if (status.ok()) { T elem; - status = elem_info.Parse(config_options, name, token, - reinterpret_cast(&elem)); + status = elem_info.Parse(copy, name, token, &elem); if (status.ok()) { result->emplace_back(elem); + } else if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); } } } @@ -739,7 +908,7 @@ Status SerializeVector(const ConfigOptions& config_options, // @param vec1,vec2 The vectors to compare. // @param mismatch If the vectors are not equivalent, mismatch will point to // the first -// element of the comparison tht did not match. +// element of the comparison that did not match. // @return true If vec1 and vec2 are "equal", false otherwise template bool VectorsAreEqual(const ConfigOptions& config_options, diff --git a/include/rocksdb/utilities/options_util.h b/include/rocksdb/utilities/options_util.h index 681b42cfdf9..064c087f05c 100644 --- a/include/rocksdb/utilities/options_util.h +++ b/include/rocksdb/utilities/options_util.h @@ -51,7 +51,7 @@ struct ConfigOptions; // BlockBasedTableOptions and making necessary changes. // // ignore_unknown_options can be set to true if you want to ignore options -// that are from a newer version of the db, esentially for forward +// that are from a newer version of the db, essentially for forward // compatibility. // // config_options contains a set of options that controls the processing @@ -66,7 +66,7 @@ struct ConfigOptions; // @return the function returns an OK status when it went successfully. If // the specified "dbpath" does not contain any option file, then a // Status::NotFound will be returned. A return value other than -// Status::OK or Status::NotFound indicates there're some error related +// Status::OK or Status::NotFound indicates there is some error related // to the options file itself. // // @see LoadOptionsFromFile diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index ba6f1d74894..17143916b30 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -25,7 +25,7 @@ class SimCache; // can help users tune their current block cache size, and determine how // efficient they are using the memory. // -// Since GetSimCapacity() returns the capacity for simulutation, it differs from +// Since GetSimCapacity() returns the capacity for simulation, it differs from // actual memory usage, which can be estimated as: // sim_capacity * entry_size / (entry_size + block_size), // where 76 <= entry_size <= 104, @@ -60,7 +60,7 @@ class SimCache : public Cache { // sets the maximum configured capacity of the simcache. When the new // capacity is less than the old capacity and the existing usage is // greater than new capacity, the implementation will purge old entries - // to fit new capapicty. + // to fit new capacity. 
virtual void SetSimCapacity(size_t capacity) = 0; // returns the lookup times of simcache diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index b785961f343..5cecc80c905 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -141,6 +141,11 @@ class StackableDB : public DB { import_options, metadata, handle); } + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_opts) override { + return db_->VerifyFileChecksums(read_opts); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } virtual Status VerifyChecksum(const ReadOptions& options) override { @@ -367,6 +372,24 @@ class StackableDB : public DB { using DB::EndBlockCacheTrace; Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartIOTrace(options, std::move(trace_writer)); + } + + using DB::EndIOTrace; + Status EndIOTrace() override { return db_->EndIOTrace(); } + + using DB::StartTrace; + Status StartTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartTrace(options, std::move(trace_writer)); + } + + using DB::EndTrace; + Status EndTrace() override { return db_->EndTrace(); } + #endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 6ebdbcc408b..dd7dd998ab3 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -24,6 +24,83 @@ using TransactionName = std::string; using TransactionID = uint64_t; +/* + class Endpoint allows to define prefix ranges. + + Prefix ranges are introduced below. + + == Basic Ranges == + Let's start from basic ranges. Key Comparator defines ordering of rowkeys. + Then, one can specify finite closed ranges by just providing rowkeys of their + endpoints: + + lower_endpoint <= X <= upper_endpoint + + However our goal is to provide a richer set of endpoints. Read on. + + == Lexicographic ordering == + A lexicographic (or dictionary) ordering satisfies these criteria: If there + are two keys in form + key_a = {prefix_a, suffix_a} + key_b = {prefix_b, suffix_b} + and + prefix_a < prefix_b + then + key_a < key_b. + + == Prefix ranges == + With lexicographic ordering, one may want to define ranges in form + + "prefix is $PREFIX" + + which translates to a range in form + + {$PREFIX, -infinity} < X < {$PREFIX, +infinity} + + where -infinity will compare less than any possible suffix, and +infinity + will compare as greater than any possible suffix. + + class Endpoint allows to define these kind of rangtes. + + == Notes == + BytewiseComparator and ReverseBytewiseComparator produce lexicographic + ordering. + + The row comparison function is able to compare key prefixes. If the data + domain includes keys A and B, then the comparison function is able to compare + equal-length prefixes: + + min_len= min(byte_length(A), byte_length(B)); + cmp(Slice(A, min_len), Slice(B, min_len)); // this call is valid + + == Other options == + As far as MyRocks is concerned, the alternative to prefix ranges would be to + support both open (non-inclusive) and closed (inclusive) range endpoints. +*/ + +class Endpoint { + public: + Slice slice; + + /* + true : the key has a "+infinity" suffix. 
A suffix that would compare as + greater than any other suffix + false : otherwise + */ + bool inf_suffix; + + explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false) + : slice(slice_arg), inf_suffix(inf_suffix_arg) {} + + explicit Endpoint(const char* s, bool inf_suffix_arg = false) + : slice(s), inf_suffix(inf_suffix_arg) {} + + Endpoint(const char* s, size_t size, bool inf_suffix_arg = false) + : slice(s, size), inf_suffix(inf_suffix_arg) {} + + Endpoint() : inf_suffix(false) {} +}; + // Provides notification to the caller of SetSnapshotOnNextOperation when // the actual snapshot gets created class TransactionNotifier { @@ -277,6 +354,12 @@ class Transaction { } } + // Get a range lock on [start_endpoint; end_endpoint]. + virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&, + const Endpoint&) { + return Status::NotSupported(); + } + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, std::string* value, bool exclusive = true, const bool do_validate = true) = 0; diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 8967b7eefab..265d4b79a09 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -31,6 +31,102 @@ enum TxnDBWritePolicy { const uint32_t kInitialMaxDeadlocks = 5; +class LockManager; +struct RangeLockInfo; + +// A lock manager handle +// The workflow is as follows: +// * Use a factory method (like NewRangeLockManager()) to create a lock +// manager and get its handle. +// * A Handle for a particular kind of lock manager will have extra +// methods and parameters to control the lock manager +// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It +// will be used to perform locking. +class LockManagerHandle { + public: + // PessimisticTransactionDB will call this to get the Lock Manager it's going + // to use. + virtual LockManager* getLockManager() = 0; + + virtual ~LockManagerHandle() {} +}; + +// Same as class Endpoint, but use std::string to manage the buffer allocation +struct EndpointWithString { + std::string slice; + bool inf_suffix; +}; + +struct RangeDeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + + EndpointWithString m_start; + EndpointWithString m_end; +}; + +struct RangeDeadlockPath { + std::vector path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit RangeDeadlockPath(std::vector path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +// A handle to control RangeLockManager (Range-based lock manager) from outside +// RocksDB +class RangeLockManagerHandle : public LockManagerHandle { + public: + // Set total amount of lock memory to use. 
+ // + // @return 0 Ok + // @return EDOM Failed to set because currently using more memory than + // specified + virtual int SetMaxLockMemory(size_t max_lock_memory) = 0; + virtual size_t GetMaxLockMemory() = 0; + + using RangeLockStatus = + std::unordered_multimap; + + virtual RangeLockStatus GetRangeLockStatusData() = 0; + + class Counters { + public: + // Number of times lock escalation was triggered (for all column families) + uint64_t escalation_count; + + // How much memory is currently used for locks (total for all column + // families) + uint64_t current_lock_memory; + }; + + // Get the current counter values + virtual Counters GetStatus() = 0; + + // Functions for range-based Deadlock reporting. + virtual std::vector GetRangeDeadlockInfoBuffer() = 0; + virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; + + virtual ~RangeLockManagerHandle() {} +}; + +// A factory function to create a Range Lock Manager. The created object should +// be: +// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in +// range-locking mode +// 2. Used to control the lock manager when the DB is already open. +RangeLockManagerHandle* NewRangeLockManager( + std::shared_ptr mutex_factory); + struct TransactionDBOptions { // Specifies the maximum number of keys that can be locked at the same time // per column family. @@ -92,9 +188,13 @@ struct TransactionDBOptions { // for the special way that myrocks uses this operands. bool rollback_merge_operands = false; + // nullptr means use default lock manager. + // Other value means the user provides a custom lock manager. + std::shared_ptr lock_mgr_handle; + // If true, the TransactionDB implementation might skip concurrency control // unless it is overridden by TransactionOptions or - // TransactionDBWriteOptimizations. This can be used in conjuction with + // TransactionDBWriteOptimizations. This can be used in conjunction with // DBOptions::unordered_write when the TransactionDB is used solely for write // ordering rather than concurrency control. bool skip_concurrency_control = false; @@ -202,6 +302,13 @@ struct KeyLockInfo { bool exclusive; }; +struct RangeLockInfo { + EndpointWithString start; + EndpointWithString end; + std::vector ids; + bool exclusive; +}; + struct DeadlockInfo { TransactionID m_txn_id; uint32_t m_cf_id; @@ -237,6 +344,17 @@ class TransactionDB : public StackableDB { // falls back to the un-optimized version of ::Write return Write(opts, updates); } + // Transactional `DeleteRange()` is not yet supported. + // However, users who know their deleted range does not conflict with + // anything can still use it via the `Write()` API. In all cases, the + // `Write()` overload specifying `TransactionDBWriteOptimizations` must be + // used and `skip_concurrency_control` must be set. When using either + // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must + // additionally be set. + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } // Open a TransactionDB similar to DB::Open(). // Internally call PrepareWrap() and WrapDB() // If the return status is not ok, then dbptr is set to nullptr. 
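The workflow described in the comments above (create a range lock manager handle, pass it via TransactionDBOptions::lock_mgr_handle, then take range locks inside a transaction) could look roughly like the following sketch; the database path, key values, and function name are illustrative assumptions:

#include <memory>
#include <string>
#include "rocksdb/utilities/transaction_db.h"

ROCKSDB_NAMESPACE::Status OpenWithRangeLocking(const std::string& path) {
  using namespace ROCKSDB_NAMESPACE;

  // nullptr selects the default mutex factory.
  std::shared_ptr<RangeLockManagerHandle> range_lock_mgr(
      NewRangeLockManager(nullptr));

  Options options;
  options.create_if_missing = true;
  TransactionDBOptions txn_db_options;
  txn_db_options.lock_mgr_handle = range_lock_mgr;  // enable range locking

  TransactionDB* db = nullptr;
  Status s = TransactionDB::Open(options, txn_db_options, path, &db);
  if (!s.ok()) {
    return s;
  }

  Transaction* txn = db->BeginTransaction(WriteOptions());
  // Lock the inclusive range ["a", "z"] in the default column family.
  s = txn->GetRangeLock(db->DefaultColumnFamily(), Endpoint("a"),
                        Endpoint("z"));
  if (s.ok()) {
    s = txn->Commit();
  }
  delete txn;
  delete db;
  return s;
}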
@@ -296,6 +414,7 @@ class TransactionDB : public StackableDB { // The mapping is column family id -> KeyLockInfo virtual std::unordered_multimap GetLockStatusData() = 0; + virtual std::vector GetDeadlockInfoBuffer() = 0; virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; diff --git a/include/rocksdb/utilities/transaction_db_mutex.h b/include/rocksdb/utilities/transaction_db_mutex.h index 96a42adf8c5..13d899c3226 100644 --- a/include/rocksdb/utilities/transaction_db_mutex.h +++ b/include/rocksdb/utilities/transaction_db_mutex.h @@ -61,7 +61,7 @@ class TransactionDBCondVar { // // Returns OK if notified. // Returns TimedOut if timeout is reached. - // Returns other status if TransactionDB should otherwis stop waiting and + // Returns other status if TransactionDB should otherwise stop waiting and // fail the operation. // May return OK spuriously even if not notified. virtual Status WaitFor(std::shared_ptr mutex, diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 424aa158239..65feaa7b8ae 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -40,12 +40,13 @@ enum WriteType { kDeleteRangeRecord, kLogDataRecord, kXIDRecord, + kUnknownRecord, }; // an entry for Put, Merge, Delete, or SingleDelete entry for write batches. // Used in WBWIIterator. struct WriteEntry { - WriteType type; + WriteType type = kUnknownRecord; Slice key; Slice value; }; @@ -168,7 +169,7 @@ class WriteBatchWithIndex : public WriteBatchBase { // returned iterator will also delete the base_iterator. // // Updating write batch with the current key of the iterator is not safe. - // We strongly recommand users not to do it. It will invalidate the current + // We strongly recommend users not to do it. It will invalidate the current // key() and value() of the iterator. This invalidation happens even before // the write batch update finishes. The state may recover after Next() is // called. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index c2631d0722b..ebdfb7468f1 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -4,9 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 14 -#define ROCKSDB_PATCH 6 +#define ROCKSDB_MINOR 22 +#define ROCKSDB_PATCH 1 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these @@ -14,3 +19,23 @@ #define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR #define __ROCKSDB_MINOR__ ROCKSDB_MINOR #define __ROCKSDB_PATCH__ ROCKSDB_PATCH + +namespace ROCKSDB_NAMESPACE { +// Returns a set of properties indicating how/when/where this version of RocksDB +// was created. +const std::unordered_map& GetRocksBuildProperties(); + +// Returns the current version of RocksDB as a string (e.g. "6.16.0"). +// If with_patch is true, the patch is included (6.16.x). +// Otherwise, only major and minor version is included (6.16) +std::string GetRocksVersionAsString(bool with_patch = true); + +// Gets the set of build properties (@see GetRocksBuildProperties) into a +// string. Properties are returned one-per-line, with the first line being: +// " from RocksDB . +// If verbose is true, the full set of properties is +// printed. 
If verbose is false, only the version information (@see +// GetRocksVersionString) is printed. +std::string GetRocksBuildInfoAsString(const std::string& program, + bool verbose = false); +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 51fd4d8ac80..d47c435bf4f 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -62,6 +62,11 @@ class WriteBatch : public WriteBatchBase { public: explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0); explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz); + // `protection_bytes_per_key` is the number of bytes used to store + // protection information for each key entry. Currently supported values are + // zero (disabled) and eight. + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz, + size_t protection_bytes_per_key); ~WriteBatch() override; using WriteBatchBase::Put; @@ -307,10 +312,10 @@ class WriteBatch : public WriteBatchBase { // Returns true if MarkEndPrepare will be called during Iterate bool HasEndPrepare() const; - // Returns trie if MarkCommit will be called during Iterate + // Returns true if MarkCommit will be called during Iterate bool HasCommit() const; - // Returns trie if MarkRollback will be called during Iterate + // Returns true if MarkRollback will be called during Iterate bool HasRollback() const; // Assign timestamp to write batch @@ -338,6 +343,9 @@ class WriteBatch : public WriteBatchBase { void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; } + struct ProtectionInfo; + size_t GetProtectionBytesPerKey() const; + private: friend class WriteBatchInternal; friend class LocalSavePoint; @@ -367,11 +375,11 @@ class WriteBatch : public WriteBatchBase { // more details. bool is_latest_persistent_state_ = false; + std::unique_ptr prot_info_; + protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ const size_t timestamp_size_; - - // Intentionally copyable }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index ae1c98cafb2..67aef7f8fe7 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -13,46 +13,96 @@ #pragma once #include +#include #include +#include +#include + #include "rocksdb/cache.h" namespace ROCKSDB_NAMESPACE { +// Interface to block and signal DB instances. +// Each DB instance contains ptr to StallInterface. +class StallInterface { + public: + virtual ~StallInterface() {} + + virtual void Block() = 0; + + virtual void Signal() = 0; +}; + class WriteBufferManager { public: - // _buffer_size = 0 indicates no limit. Memory won't be capped. + // Parameters: + // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped. // memory_usage() won't be valid and ShouldFlush() will always return true. - // if `cache` is provided, we'll put dummy entries in the cache and cost - // the memory allocated to the cache. It can be used even if _buffer_size = 0. + // + // cache_: if `cache` is provided, we'll put dummy entries in the cache and + // cost the memory allocated to the cache. It can be used even if _buffer_size + // = 0. + // + // allow_stall: if set true, it will enable stalling of writes when + // memory_usage() exceeds buffer_size. It will wait for flush to complete and + // memory usage to drop down. 
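As a usage sketch of the shared write-buffer limit and the new allow_stall flag documented above (the sizes and the helper name are arbitrary assumptions), a single WriteBufferManager can be shared by several DBs, charge memtable memory to a block cache, and stall writers once the limit is hit:

#include <memory>
#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"

void ConfigureSharedWriteBuffer(ROCKSDB_NAMESPACE::Options* options) {
  auto cache = ROCKSDB_NAMESPACE::NewLRUCache(1ULL << 30);  // 1 GiB block cache
  auto wbm = std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(
      512ULL << 20 /* 512 MiB across all memtables */, cache,
      /*allow_stall=*/true);
  options->write_buffer_manager = wbm;
  // The limit can be adjusted at runtime; per SetBufferSize() below, lowering
  // it also re-checks whether an active stall can be ended.
  wbm->SetBufferSize(256ULL << 20);
}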
explicit WriteBufferManager(size_t _buffer_size, - std::shared_ptr<Cache> cache = {}); + std::shared_ptr<Cache> cache = {}, + bool allow_stall = false); // No copying allowed WriteBufferManager(const WriteBufferManager&) = delete; WriteBufferManager& operator=(const WriteBufferManager&) = delete; ~WriteBufferManager(); - bool enabled() const { return buffer_size_ != 0; } + // Returns true if buffer_limit is passed to limit the total memory usage and + // is greater than 0. + bool enabled() const { return buffer_size() > 0; } + // Returns true if pointer to cache is passed. bool cost_to_cache() const { return cache_rep_ != nullptr; } + // Returns the total memory used by memtables. // Only valid if enabled() size_t memory_usage() const { return memory_used_.load(std::memory_order_relaxed); } + + // Returns the total memory used by active memtables. size_t mutable_memtable_memory_usage() const { return memory_active_.load(std::memory_order_relaxed); } - size_t buffer_size() const { return buffer_size_; } + + size_t dummy_entries_in_cache_usage() const { + return dummy_size_.load(std::memory_order_relaxed); + } + + // Returns the buffer_size. + size_t buffer_size() const { + return buffer_size_.load(std::memory_order_relaxed); + } + + void SetBufferSize(size_t new_size) { + buffer_size_.store(new_size, std::memory_order_relaxed); + mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); + // Check if stall is active and can be ended. + if (allow_stall_) { + EndWriteStall(); + } + } + + // Below functions should be called by RocksDB internally. // Should only be called from write thread bool ShouldFlush() const { if (enabled()) { - if (mutable_memtable_memory_usage() > mutable_limit_) { + if (mutable_memtable_memory_usage() > + mutable_limit_.load(std::memory_order_relaxed)) { return true; } - if (memory_usage() >= buffer_size_ && - mutable_memtable_memory_usage() >= buffer_size_ / 2) { + size_t local_size = buffer_size(); + if (memory_usage() >= local_size && + mutable_memtable_memory_usage() >= local_size / 2) { // If the memory exceeds the buffer size, we trigger more aggressive // flush. But if already more than half memory is being flushed, // triggering more flush may not help. We will hold it instead. @@ -62,39 +112,65 @@ class WriteBufferManager { return false; } - void ReserveMem(size_t mem) { - if (cache_rep_ != nullptr) { - ReserveMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_add(mem, std::memory_order_relaxed); - } - if (enabled()) { - memory_active_.fetch_add(mem, std::memory_order_relaxed); + // Returns true if total memory usage exceeded buffer_size. + // We stall the writes until memory_usage drops below buffer_size. When the + // function returns true, all writer threads (including the one checking this + // condition) across all DBs will be stalled. Stalling is allowed only if the + // user passes allow_stall = true during WriteBufferManager instance creation. + // + // Should only be called by RocksDB internally. + bool ShouldStall() { + if (allow_stall_ && enabled()) { + if (IsStallActive()) { + return true; + } + if (IsStallThresholdExceeded()) { + stall_active_.store(true, std::memory_order_relaxed); + return true; + } } + return false; } + + // Returns true if stall is active. + bool IsStallActive() const { + return stall_active_.load(std::memory_order_relaxed); + } + + // Returns true if the stalling condition is met.
+ bool IsStallThresholdExceeded() { return memory_usage() >= buffer_size_; } + + void ReserveMem(size_t mem); + // We are in the process of freeing `mem` bytes, so it is not considered // when checking the soft limit. - void ScheduleFreeMem(size_t mem) { - if (enabled()) { - memory_active_.fetch_sub(mem, std::memory_order_relaxed); - } - } - void FreeMem(size_t mem) { - if (cache_rep_ != nullptr) { - FreeMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_sub(mem, std::memory_order_relaxed); - } - } + void ScheduleFreeMem(size_t mem); + + void FreeMem(size_t mem); + + // Add the DB instance to the queue and block the DB. + // Should only be called by RocksDB internally. + void BeginWriteStall(StallInterface* wbm_stall); + + // Remove DB instances from queue and signal them to continue. + void EndWriteStall(); + + void RemoveDBFromQueue(StallInterface* wbm_stall); private: - const size_t buffer_size_; - const size_t mutable_limit_; + std::atomic buffer_size_; + std::atomic mutable_limit_; std::atomic memory_used_; // Memory that hasn't been scheduled to free. std::atomic memory_active_; + std::atomic dummy_size_; struct CacheRep; std::unique_ptr cache_rep_; + std::list queue_; + // Protects the queue_ + std::mutex mu_; + bool allow_stall_; + std::atomic stall_active_; void ReserveMemWithCache(size_t mem); void FreeMemWithCache(size_t mem); diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index ebc1c197092..3ec6099fc26 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -13,6 +13,7 @@ set(JNI_NATIVE_SOURCES rocksjni/cassandra_value_operator.cc rocksjni/checkpoint.cc rocksjni/clock_cache.cc + rocksjni/cache.cc rocksjni/columnfamilyhandle.cc rocksjni/compaction_filter.cc rocksjni/compaction_filter_factory.cc @@ -30,6 +31,8 @@ set(JNI_NATIVE_SOURCES rocksjni/config_options.cc rocksjni/env.cc rocksjni/env_options.cc + rocksjni/event_listener.cc + rocksjni/event_listener_jnicallback.cc rocksjni/filter.cc rocksjni/ingest_external_file_options.cc rocksjni/iterator.cc @@ -63,6 +66,7 @@ set(JNI_NATIVE_SOURCES rocksjni/table.cc rocksjni/table_filter.cc rocksjni/table_filter_jnicallback.cc + rocksjni/testable_event_listener.cc rocksjni/thread_status.cc rocksjni/trace_writer.cc rocksjni/trace_writer_jnicallback.cc @@ -87,6 +91,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AbstractCompactionFilter.java src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java src/main/java/org/rocksdb/AbstractComparator.java + src/main/java/org/rocksdb/AbstractEventListener.java src/main/java/org/rocksdb/AbstractImmutableNativeReference.java src/main/java/org/rocksdb/AbstractMutableOptions.java src/main/java/org/rocksdb/AbstractNativeReference.java @@ -100,6 +105,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AccessHint.java src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/BackgroundErrorReason.java src/main/java/org/rocksdb/BackupableDBOptions.java src/main/java/org/rocksdb/BackupEngine.java src/main/java/org/rocksdb/BackupInfo.java @@ -140,8 +146,13 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/EncodingType.java src/main/java/org/rocksdb/Env.java src/main/java/org/rocksdb/EnvOptions.java + src/main/java/org/rocksdb/EventListener.java src/main/java/org/rocksdb/Experimental.java + src/main/java/org/rocksdb/ExternalFileIngestionInfo.java src/main/java/org/rocksdb/Filter.java + src/main/java/org/rocksdb/FileOperationInfo.java + 
src/main/java/org/rocksdb/FlushJobInfo.java + src/main/java/org/rocksdb/FlushReason.java src/main/java/org/rocksdb/FlushOptions.java src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java src/main/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -163,6 +174,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/MemoryUsageType.java src/main/java/org/rocksdb/MemoryUtil.java src/main/java/org/rocksdb/MemTableConfig.java + src/main/java/org/rocksdb/MemTableInfo.java src/main/java/org/rocksdb/MergeOperator.java src/main/java/org/rocksdb/MutableColumnFamilyOptions.java src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -218,6 +230,10 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/StatsLevel.java src/main/java/org/rocksdb/Status.java src/main/java/org/rocksdb/StringAppendOperator.java + src/main/java/org/rocksdb/TableFileCreationBriefInfo.java + src/main/java/org/rocksdb/TableFileCreationInfo.java + src/main/java/org/rocksdb/TableFileCreationReason.java + src/main/java/org/rocksdb/TableFileDeletionInfo.java src/main/java/org/rocksdb/TableFilter.java src/main/java/org/rocksdb/TableProperties.java src/main/java/org/rocksdb/TableFormatConfig.java @@ -247,6 +263,8 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java src/main/java/org/rocksdb/WriteBufferManager.java + src/main/java/org/rocksdb/WriteStallCondition.java + src/main/java/org/rocksdb/WriteStallInfo.java src/main/java/org/rocksdb/util/ByteUtil.java src/main/java/org/rocksdb/util/BytewiseComparator.java src/main/java/org/rocksdb/util/Environment.java @@ -267,6 +285,7 @@ set(JAVA_TEST_CLASSES src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java + src/test/java/org/rocksdb/test/TestableEventListener.java ) include(FindJava) @@ -391,6 +410,7 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" org.rocksdb.AbstractCompactionFilter org.rocksdb.AbstractCompactionFilterFactory org.rocksdb.AbstractComparator + org.rocksdb.AbstractEventListener org.rocksdb.AbstractImmutableNativeReference org.rocksdb.AbstractNativeReference org.rocksdb.AbstractRocksIterator @@ -407,6 +427,7 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" org.rocksdb.CassandraValueMergeOperator org.rocksdb.Checkpoint org.rocksdb.ClockCache + org.rocksdb.Cache org.rocksdb.ColumnFamilyHandle org.rocksdb.ColumnFamilyOptions org.rocksdb.CompactionJobInfo @@ -417,6 +438,8 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" org.rocksdb.CompactRangeOptions org.rocksdb.ComparatorOptions org.rocksdb.CompressionOptions + org.rocksdb.ConcurrentTaskLimiterImpl + org.rocksdb.ConfigOptions org.rocksdb.DBOptions org.rocksdb.DirectSlice org.rocksdb.Env @@ -485,6 +508,7 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" org.rocksdb.WriteBatchTest org.rocksdb.WriteBatchTestInternalHelper org.rocksdb.WriteBufferManager + org.rocksdb.test.TestableEventListener ) create_javah( diff --git a/java/Makefile b/java/Makefile index c391a9bd235..1c702371222 100644 --- a/java/Makefile +++ b/java/Makefile @@ -2,6 +2,7 @@ NATIVE_JAVA_CLASSES = \ org.rocksdb.AbstractCompactionFilter\ org.rocksdb.AbstractCompactionFilterFactory\ org.rocksdb.AbstractComparator\ + org.rocksdb.AbstractEventListener\ org.rocksdb.AbstractSlice\ org.rocksdb.AbstractTableFilter\ 
org.rocksdb.AbstractTraceWriter\ @@ -13,6 +14,7 @@ NATIVE_JAVA_CLASSES = \ org.rocksdb.BloomFilter\ org.rocksdb.Checkpoint\ org.rocksdb.ClockCache\ + org.rocksdb.Cache\ org.rocksdb.CassandraCompactionFilter\ org.rocksdb.CassandraValueMergeOperator\ org.rocksdb.ColumnFamilyHandle\ @@ -87,7 +89,9 @@ NATIVE_JAVA_CLASSES = \ org.rocksdb.WriteBufferManager\ org.rocksdb.WBWIRocksIterator -NATIVE_JAVA_TEST_CLASSES = org.rocksdb.RocksDBExceptionTest\ +NATIVE_JAVA_TEST_CLASSES = \ + org.rocksdb.RocksDBExceptionTest\ + org.rocksdb.test.TestableEventListener\ org.rocksdb.NativeComparatorWrapperTest.NativeStringComparatorWrapper\ org.rocksdb.WriteBatchTest\ org.rocksdb.WriteBatchTestInternalHelper @@ -98,10 +102,7 @@ ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h NATIVE_INCLUDE = ./include ARCH := $(shell getconf LONG_BIT) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar -ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -endif +SHA256_CMD ?= sha256sum JAVA_TESTS = \ org.rocksdb.BackupableDBOptionsTest\ @@ -129,6 +130,7 @@ JAVA_TESTS = \ org.rocksdb.DirectSliceTest\ org.rocksdb.util.EnvironmentTest\ org.rocksdb.EnvOptionsTest\ + org.rocksdb.EventListenerTest\ org.rocksdb.HdfsEnvTest\ org.rocksdb.IngestExternalFileOptionsTest\ org.rocksdb.util.IntComparatorTest\ @@ -205,12 +207,27 @@ SAMPLES_OUTPUT = samples/target SAMPLES_MAIN_CLASSES = $(SAMPLES_OUTPUT)/classes JAVA_TEST_LIBDIR = test-libs -JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)/junit-4.12.jar -JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)/hamcrest-core-1.3.jar -JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)/mockito-all-1.10.19.jar -JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)/cglib-2.2.2.jar -JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)/assertj-core-1.7.1.jar -JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR) +JAVA_JUNIT_VER = 4.13.1 +JAVA_JUNIT_SHA256 = c30719db974d6452793fe191b3638a5777005485bae145924044530ffa5f6122 +JAVA_JUNIT_JAR = junit-$(JAVA_JUNIT_VER).jar +JAVA_JUNIT_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_JUNIT_JAR) +JAVA_HAMCREST_VER = 2.2 +JAVA_HAMCREST_SHA256 = 5e62846a89f05cd78cd9c1a553f340d002458380c320455dd1f8fc5497a8a1c1 +JAVA_HAMCREST_JAR = hamcrest-$(JAVA_HAMCREST_VER).jar +JAVA_HAMCREST_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_HAMCREST_JAR) +JAVA_MOCKITO_VER = 1.10.19 +JAVA_MOCKITO_SHA256 = d1a7a7ef14b3db5c0fc3e0a63a81b374b510afe85add9f7984b97911f4c70605 +JAVA_MOCKITO_JAR = mockito-all-$(JAVA_MOCKITO_VER).jar +JAVA_MOCKITO_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_MOCKITO_JAR) +JAVA_CGLIB_VER = 3.3.0 +JAVA_CGLIB_SHA256 = 9fe0c26d7464140ccdfe019ac687be1fb906122b508ab54beb810db0f09a9212 +JAVA_CGLIB_JAR = cglib-$(JAVA_CGLIB_VER).jar +JAVA_CGLIB_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_CGLIB_JAR) +JAVA_ASSERTJ_VER = 2.9.0 +JAVA_ASSERTJ_SHA256 = 5e88ea3ecbe3c48aa1346fec76c84979fa9c8d22499f11479011691230e8babf +JAVA_ASSERTJ_JAR = assertj-core-$(JAVA_ASSERTJ_VER).jar +JAVA_ASSERTJ_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_ASSERTJ_JAR) +JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR_PATH):$(JAVA_HAMCREST_JAR_PATH):$(JAVA_MOCKITO_JAR_PATH):$(JAVA_CGLIB_JAR_PATH):$(JAVA_ASSERTJ_JAR_PATH) MVN_LOCAL = ~/.m2/repository @@ -294,13 +311,70 @@ optimistic_transaction_sample: java java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni -resolve_test_deps: - test -d "$(JAVA_TEST_LIBDIR)" 
|| mkdir -p "$(JAVA_TEST_LIBDIR)" - test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_JUNIT_JAR) --location $(DEPS_URL)/junit-4.12.jar - test -s "$(JAVA_HAMCR_JAR)" || cp $(MVN_LOCAL)/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_HAMCR_JAR) --location $(DEPS_URL)/hamcrest-core-1.3.jar - test -s "$(JAVA_MOCKITO_JAR)" || cp $(MVN_LOCAL)/org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_MOCKITO_JAR)" --location $(DEPS_URL)/mockito-all-1.10.19.jar - test -s "$(JAVA_CGLIB_JAR)" || cp $(MVN_LOCAL)/cglib/cglib/2.2.2/cglib-2.2.2.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_CGLIB_JAR)" --location $(DEPS_URL)/cglib-2.2.2.jar - test -s "$(JAVA_ASSERTJ_JAR)" || cp $(MVN_LOCAL)/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_ASSERTJ_JAR)" --location $(DEPS_URL)/assertj-core-1.7.1.jar +$(JAVA_TEST_LIBDIR): + mkdir -p "$(JAVA_TEST_LIBDIR)" + +$(JAVA_JUNIT_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR))) + cp -v $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output $(JAVA_JUNIT_JAR_PATH) --location $(DEPS_URL)/$(JAVA_JUNIT_JAR) + JAVA_JUNIT_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_JUNIT_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_JUNIT_SHA256)" != "$$JAVA_JUNIT_SHA256_ACTUAL" ]; then \ + echo $(JAVA_JUNIT_JAR_PATH) checksum mismatch, expected=\"$(JAVA_JUNIT_SHA256)\" actual=\"$$JAVA_JUNIT_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_HAMCREST_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR))) + cp -v $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output $(JAVA_HAMCREST_JAR_PATH) --location $(DEPS_URL)/$(JAVA_HAMCREST_JAR) + JAVA_HAMCREST_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_HAMCREST_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_HAMCREST_SHA256)" != "$$JAVA_HAMCREST_SHA256_ACTUAL" ]; then \ + echo $(JAVA_HAMCREST_JAR_PATH) checksum mismatch, expected=\"$(JAVA_HAMCREST_SHA256)\" actual=\"$$JAVA_HAMCREST_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_MOCKITO_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR))) + cp -v $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_MOCKITO_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_MOCKITO_JAR) + JAVA_MOCKITO_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_MOCKITO_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_MOCKITO_SHA256)" != "$$JAVA_MOCKITO_SHA256_ACTUAL" ]; then \ + echo $(JAVA_MOCKITO_JAR_PATH) checksum mismatch, expected=\"$(JAVA_MOCKITO_SHA256)\" actual=\"$$JAVA_MOCKITO_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_CGLIB_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR))) + cp -v $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_CGLIB_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_CGLIB_JAR) + JAVA_CGLIB_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_CGLIB_JAR_PATH) | 
cut -d ' ' -f 1`; \ + if [ "$(JAVA_CGLIB_SHA256)" != "$$JAVA_CGLIB_SHA256_ACTUAL" ]; then \ + echo $(JAVA_CGLIB_JAR_PATH) checksum mismatch, expected=\"$(JAVA_CGLIB_SHA256)\" actual=\"$$JAVA_CGLIB_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_ASSERTJ_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR))) + cp -v $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_ASSERTJ_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_ASSERTJ_JAR) + JAVA_ASSERTJ_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_ASSERTJ_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_ASSERTJ_SHA256)" != "$$JAVA_ASSERTJ_SHA256_ACTUAL" ]; then \ + echo $(JAVA_ASSERTJ_JAR_PATH) checksum mismatch, expected=\"$(JAVA_ASSERTJ_SHA256)\" actual=\"$$JAVA_ASSERTJ_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +resolve_test_deps: $(JAVA_JUNIT_JAR_PATH) $(JAVA_HAMCREST_JAR_PATH) $(JAVA_MOCKITO_JAR_PATH) $(JAVA_CGLIB_JAR_PATH) $(JAVA_ASSERTJ_JAR_PATH) java_test: java resolve_test_deps $(AM_V_GEN)mkdir -p $(TEST_CLASSES) diff --git a/java/pom.xml.template b/java/pom.xml.template index 4cd70458a87..4abff4768e4 100644 --- a/java/pom.xml.template +++ b/java/pom.xml.template @@ -147,13 +147,25 @@ junit junit - 4.12 + 4.13.1 + test + + + org.hamcrest + hamcrest + 2.2 + test + + + cglib + cglib + 3.3.0 test org.assertj assertj-core - 1.7.1 + 2.9.0 test diff --git a/java/rocksjni/cache.cc b/java/rocksjni/cache.cc new file mode 100644 index 00000000000..33c0a2f0be7 --- /dev/null +++ b/java/rocksjni/cache.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::Cache. 
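The new rocksjni/cache.cc below simply forwards two Java methods to the native Cache accessors; for reference, the underlying C++ calls look like this (a sketch only; the cache size and function name are arbitrary assumptions):

#include "rocksdb/cache.h"

void ReportCacheUsage() {
  auto cache = ROCKSDB_NAMESPACE::NewLRUCache(64ULL << 20);
  size_t used = cache->GetUsage();          // total charge of all entries
  size_t pinned = cache->GetPinnedUsage();  // charge of entries still referenced
  (void)used;
  (void)pinned;
}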
+ +#include "rocksdb/cache.h" + +#include + +#include "include/org_rocksdb_Cache.h" + +/* + * Class: org_rocksdb_Cache + * Method: getUsage + * Signature: (J)J + */ +jlong Java_org_rocksdb_Cache_getUsage(JNIEnv*, jclass, jlong jhandle) { + auto* sptr_cache = + reinterpret_cast*>(jhandle); + return static_cast(sptr_cache->get()->GetUsage()); +} + +/* + * Class: org_rocksdb_Cache + * Method: getPinnedUsage + * Signature: (J)J + */ +jlong Java_org_rocksdb_Cache_getPinnedUsage(JNIEnv*, jclass, jlong jhandle) { + auto* sptr_cache = + reinterpret_cast*>(jhandle); + return static_cast(sptr_cache->get()->GetPinnedUsage()); +} diff --git a/java/rocksjni/compression_options.cc b/java/rocksjni/compression_options.cc index 4fed5ba5f9c..1857faf6807 100644 --- a/java/rocksjni/compression_options.cc +++ b/java/rocksjni/compression_options.cc @@ -132,6 +132,27 @@ jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes( return static_cast(opt->zstd_max_train_bytes); } +/* + * Class: org_rocksdb_CompressionOptions + * Method: setMaxDictBufferBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_CompressionOptions_setMaxDictBufferBytes( + JNIEnv*, jobject, jlong jhandle, jlong jmax_dict_buffer_bytes) { + auto* opt = reinterpret_cast(jhandle); + opt->max_dict_buffer_bytes = static_cast(jmax_dict_buffer_bytes); +} + +/* + * Class: org_rocksdb_CompressionOptions + * Method: maxDictBufferBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_CompressionOptions_maxDictBufferBytes(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_dict_buffer_bytes); +} /* * Class: org_rocksdb_CompressionOptions * Method: setEnabled diff --git a/java/rocksjni/concurrent_task_limiter.cc b/java/rocksjni/concurrent_task_limiter.cc index ddcdda478e3..591f721a2f6 100644 --- a/java/rocksjni/concurrent_task_limiter.cc +++ b/java/rocksjni/concurrent_task_limiter.cc @@ -15,7 +15,7 @@ */ jlong Java_org_rocksdb_ConcurrentTaskLimiterImpl_newConcurrentTaskLimiterImpl0( JNIEnv* env, jclass, jstring jname, jint limit) { - jboolean has_exception; + jboolean has_exception = JNI_FALSE; std::string name = ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jname, &has_exception); if (JNI_TRUE == has_exception) { diff --git a/java/rocksjni/event_listener.cc b/java/rocksjni/event_listener.cc new file mode 100644 index 00000000000..5b07ffc9be0 --- /dev/null +++ b/java/rocksjni/event_listener.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::EventListener. 
+ +#include + +#include + +#include "include/org_rocksdb_AbstractEventListener.h" +#include "rocksjni/event_listener_jnicallback.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_AbstractEventListener + * Method: createNewEventListener + * Signature: (J)J + */ +jlong Java_org_rocksdb_AbstractEventListener_createNewEventListener( + JNIEnv* env, jobject jobj, jlong jenabled_event_callback_values) { + auto enabled_event_callbacks = + ROCKSDB_NAMESPACE::EnabledEventCallbackJni::toCppEnabledEventCallbacks( + jenabled_event_callback_values); + auto* sptr_event_listener = + new std::shared_ptr( + new ROCKSDB_NAMESPACE::EventListenerJniCallback( + env, jobj, enabled_event_callbacks)); + return reinterpret_cast(sptr_event_listener); +} + +/* + * Class: org_rocksdb_AbstractEventListener + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_AbstractEventListener_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { + delete reinterpret_cast*>( + jhandle); +} diff --git a/java/rocksjni/event_listener_jnicallback.cc b/java/rocksjni/event_listener_jnicallback.cc new file mode 100644 index 00000000000..6e4d8975edf --- /dev/null +++ b/java/rocksjni/event_listener_jnicallback.cc @@ -0,0 +1,502 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::EventListener. + +#include "rocksjni/event_listener_jnicallback.h" + +#include "rocksjni/portal.h" + +namespace rocksdb { +EventListenerJniCallback::EventListenerJniCallback( + JNIEnv* env, jobject jevent_listener, + const std::set& enabled_event_callbacks) + : JniCallback(env, jevent_listener), + m_enabled_event_callbacks(enabled_event_callbacks) { + InitCallbackMethodId( + m_on_flush_completed_proxy_mid, EnabledEventCallback::ON_FLUSH_COMPLETED, + env, AbstractEventListenerJni::getOnFlushCompletedProxyMethodId); + + InitCallbackMethodId(m_on_flush_begin_proxy_mid, + EnabledEventCallback::ON_FLUSH_BEGIN, env, + AbstractEventListenerJni::getOnFlushBeginProxyMethodId); + + InitCallbackMethodId(m_on_table_file_deleted_mid, + EnabledEventCallback::ON_TABLE_FILE_DELETED, env, + AbstractEventListenerJni::getOnTableFileDeletedMethodId); + + InitCallbackMethodId( + m_on_compaction_begin_proxy_mid, + EnabledEventCallback::ON_COMPACTION_BEGIN, env, + AbstractEventListenerJni::getOnCompactionBeginProxyMethodId); + + InitCallbackMethodId( + m_on_compaction_completed_proxy_mid, + EnabledEventCallback::ON_COMPACTION_COMPLETED, env, + AbstractEventListenerJni::getOnCompactionCompletedProxyMethodId); + + InitCallbackMethodId(m_on_table_file_created_mid, + EnabledEventCallback::ON_TABLE_FILE_CREATED, env, + AbstractEventListenerJni::getOnTableFileCreatedMethodId); + + InitCallbackMethodId( + m_on_table_file_creation_started_mid, + EnabledEventCallback::ON_TABLE_FILE_CREATION_STARTED, env, + AbstractEventListenerJni::getOnTableFileCreationStartedMethodId); + + InitCallbackMethodId(m_on_mem_table_sealed_mid, + EnabledEventCallback::ON_MEMTABLE_SEALED, env, + AbstractEventListenerJni::getOnMemTableSealedMethodId); + + InitCallbackMethodId( + m_on_column_family_handle_deletion_started_mid, + EnabledEventCallback::ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, env, + AbstractEventListenerJni::getOnColumnFamilyHandleDeletionStartedMethodId); + + 
InitCallbackMethodId( + m_on_external_file_ingested_proxy_mid, + EnabledEventCallback::ON_EXTERNAL_FILE_INGESTED, env, + AbstractEventListenerJni::getOnExternalFileIngestedProxyMethodId); + + InitCallbackMethodId( + m_on_background_error_proxy_mid, + EnabledEventCallback::ON_BACKGROUND_ERROR, env, + AbstractEventListenerJni::getOnBackgroundErrorProxyMethodId); + + InitCallbackMethodId( + m_on_stall_conditions_changed_mid, + EnabledEventCallback::ON_STALL_CONDITIONS_CHANGED, env, + AbstractEventListenerJni::getOnStallConditionsChangedMethodId); + + InitCallbackMethodId(m_on_file_read_finish_mid, + EnabledEventCallback::ON_FILE_READ_FINISH, env, + AbstractEventListenerJni::getOnFileReadFinishMethodId); + + InitCallbackMethodId(m_on_file_write_finish_mid, + EnabledEventCallback::ON_FILE_WRITE_FINISH, env, + AbstractEventListenerJni::getOnFileWriteFinishMethodId); + + InitCallbackMethodId(m_on_file_flush_finish_mid, + EnabledEventCallback::ON_FILE_FLUSH_FINISH, env, + AbstractEventListenerJni::getOnFileFlushFinishMethodId); + + InitCallbackMethodId(m_on_file_sync_finish_mid, + EnabledEventCallback::ON_FILE_SYNC_FINISH, env, + AbstractEventListenerJni::getOnFileSyncFinishMethodId); + + InitCallbackMethodId( + m_on_file_range_sync_finish_mid, + EnabledEventCallback::ON_FILE_RANGE_SYNC_FINISH, env, + AbstractEventListenerJni::getOnFileRangeSyncFinishMethodId); + + InitCallbackMethodId( + m_on_file_truncate_finish_mid, + EnabledEventCallback::ON_FILE_TRUNCATE_FINISH, env, + AbstractEventListenerJni::getOnFileTruncateFinishMethodId); + + InitCallbackMethodId(m_on_file_close_finish_mid, + EnabledEventCallback::ON_FILE_CLOSE_FINISH, env, + AbstractEventListenerJni::getOnFileCloseFinishMethodId); + + InitCallbackMethodId( + m_should_be_notified_on_file_io, + EnabledEventCallback::SHOULD_BE_NOTIFIED_ON_FILE_IO, env, + AbstractEventListenerJni::getShouldBeNotifiedOnFileIOMethodId); + + InitCallbackMethodId( + m_on_error_recovery_begin_proxy_mid, + EnabledEventCallback::ON_ERROR_RECOVERY_BEGIN, env, + AbstractEventListenerJni::getOnErrorRecoveryBeginProxyMethodId); + + InitCallbackMethodId( + m_on_error_recovery_completed_mid, + EnabledEventCallback::ON_ERROR_RECOVERY_COMPLETED, env, + AbstractEventListenerJni::getOnErrorRecoveryCompletedMethodId); +} + +EventListenerJniCallback::~EventListenerJniCallback() {} + +void EventListenerJniCallback::OnFlushCompleted( + DB* db, const FlushJobInfo& flush_job_info) { + if (m_on_flush_completed_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jflush_job_info = SetupCallbackInvocation( + env, attached_thread, flush_job_info, + FlushJobInfoJni::fromCppFlushJobInfo); + + if (jflush_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_flush_completed_proxy_mid, + reinterpret_cast(db), jflush_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info}); +} + +void EventListenerJniCallback::OnFlushBegin( + DB* db, const FlushJobInfo& flush_job_info) { + if (m_on_flush_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jflush_job_info = SetupCallbackInvocation( + env, attached_thread, flush_job_info, + FlushJobInfoJni::fromCppFlushJobInfo); + + if (jflush_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_flush_begin_proxy_mid, + reinterpret_cast(db), jflush_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info}); +} + +void EventListenerJniCallback::OnTableFileDeleted( + const 
TableFileDeletionInfo& info) { + if (m_on_table_file_deleted_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jdeletion_info = SetupCallbackInvocation( + env, attached_thread, info, + TableFileDeletionInfoJni::fromCppTableFileDeletionInfo); + + if (jdeletion_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_deleted_mid, + jdeletion_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jdeletion_info}); +} + +void EventListenerJniCallback::OnCompactionBegin(DB* db, + const CompactionJobInfo& ci) { + if (m_on_compaction_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcompaction_job_info = SetupCallbackInvocation( + env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo); + + if (jcompaction_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_compaction_begin_proxy_mid, + reinterpret_cast(db), jcompaction_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info}); +} + +void EventListenerJniCallback::OnCompactionCompleted( + DB* db, const CompactionJobInfo& ci) { + if (m_on_compaction_completed_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcompaction_job_info = SetupCallbackInvocation( + env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo); + + if (jcompaction_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_compaction_completed_proxy_mid, + reinterpret_cast(db), jcompaction_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info}); +} + +void EventListenerJniCallback::OnTableFileCreated( + const TableFileCreationInfo& info) { + if (m_on_table_file_created_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jfile_creation_info = SetupCallbackInvocation( + env, attached_thread, info, + TableFileCreationInfoJni::fromCppTableFileCreationInfo); + + if (jfile_creation_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_created_mid, + jfile_creation_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jfile_creation_info}); +} + +void EventListenerJniCallback::OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info) { + if (m_on_table_file_creation_started_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcreation_brief_info = + SetupCallbackInvocation( + env, attached_thread, info, + TableFileCreationBriefInfoJni::fromCppTableFileCreationBriefInfo); + + if (jcreation_brief_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_creation_started_mid, + jcreation_brief_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcreation_brief_info}); +} + +void EventListenerJniCallback::OnMemTableSealed(const MemTableInfo& info) { + if (m_on_mem_table_sealed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jmem_table_info = SetupCallbackInvocation( + env, attached_thread, info, MemTableInfoJni::fromCppMemTableInfo); + + if (jmem_table_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_mem_table_sealed_mid, + jmem_table_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jmem_table_info}); +} + +void EventListenerJniCallback::OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* handle) { + if (m_on_column_family_handle_deletion_started_mid == nullptr) { + return; + } + + JNIEnv* env; + 
jboolean attached_thread; + jobject jcf_handle = SetupCallbackInvocation( + env, attached_thread, *handle, + ColumnFamilyHandleJni::fromCppColumnFamilyHandle); + + if (jcf_handle != nullptr) { + env->CallVoidMethod(m_jcallback_obj, + m_on_column_family_handle_deletion_started_mid, + jcf_handle); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcf_handle}); +} + +void EventListenerJniCallback::OnExternalFileIngested( + DB* db, const ExternalFileIngestionInfo& info) { + if (m_on_external_file_ingested_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jingestion_info = SetupCallbackInvocation( + env, attached_thread, info, + ExternalFileIngestionInfoJni::fromCppExternalFileIngestionInfo); + + if (jingestion_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_external_file_ingested_proxy_mid, + reinterpret_cast(db), jingestion_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jingestion_info}); +} + +void EventListenerJniCallback::OnBackgroundError(BackgroundErrorReason reason, + Status* bg_error) { + if (m_on_background_error_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jstatus = SetupCallbackInvocation( + env, attached_thread, *bg_error, StatusJni::construct); + + if (jstatus != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_background_error_proxy_mid, + static_cast(reason), jstatus); + } + + CleanupCallbackInvocation(env, attached_thread, {&jstatus}); +} + +void EventListenerJniCallback::OnStallConditionsChanged( + const WriteStallInfo& info) { + if (m_on_stall_conditions_changed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jwrite_stall_info = SetupCallbackInvocation( + env, attached_thread, info, WriteStallInfoJni::fromCppWriteStallInfo); + + if (jwrite_stall_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_stall_conditions_changed_mid, + jwrite_stall_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jwrite_stall_info}); +} + +void EventListenerJniCallback::OnFileReadFinish(const FileOperationInfo& info) { + OnFileOperation(m_on_file_read_finish_mid, info); +} + +void EventListenerJniCallback::OnFileWriteFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_write_finish_mid, info); +} + +void EventListenerJniCallback::OnFileFlushFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_flush_finish_mid, info); +} + +void EventListenerJniCallback::OnFileSyncFinish(const FileOperationInfo& info) { + OnFileOperation(m_on_file_sync_finish_mid, info); +} + +void EventListenerJniCallback::OnFileRangeSyncFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_range_sync_finish_mid, info); +} + +void EventListenerJniCallback::OnFileTruncateFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_truncate_finish_mid, info); +} + +void EventListenerJniCallback::OnFileCloseFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_close_finish_mid, info); +} + +bool EventListenerJniCallback::ShouldBeNotifiedOnFileIO() { + if (m_should_be_notified_on_file_io == nullptr) { + return false; + } + + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + assert(env != nullptr); + + jboolean jshould_be_notified = + env->CallBooleanMethod(m_jcallback_obj, m_should_be_notified_on_file_io); + + CleanupCallbackInvocation(env, attached_thread, {}); + + return static_cast(jshould_be_notified); +} + +void 
EventListenerJniCallback::OnErrorRecoveryBegin( + BackgroundErrorReason reason, Status bg_error, bool* auto_recovery) { + if (m_on_error_recovery_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jbg_error = SetupCallbackInvocation( + env, attached_thread, bg_error, StatusJni::construct); + + if (jbg_error != nullptr) { + jboolean jauto_recovery = env->CallBooleanMethod( + m_jcallback_obj, m_on_error_recovery_begin_proxy_mid, + static_cast(reason), jbg_error); + *auto_recovery = jauto_recovery == JNI_TRUE; + } + + CleanupCallbackInvocation(env, attached_thread, {&jbg_error}); +} + +void EventListenerJniCallback::OnErrorRecoveryCompleted(Status old_bg_error) { + if (m_on_error_recovery_completed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jold_bg_error = SetupCallbackInvocation( + env, attached_thread, old_bg_error, StatusJni::construct); + + if (jold_bg_error != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_error_recovery_completed_mid, + jold_bg_error); + } + + CleanupCallbackInvocation(env, attached_thread, {&jold_bg_error}); +} + +void EventListenerJniCallback::InitCallbackMethodId( + jmethodID& mid, EnabledEventCallback eec, JNIEnv* env, + jmethodID (*get_id)(JNIEnv* env)) { + if (m_enabled_event_callbacks.count(eec) == 1) { + mid = get_id(env); + } else { + mid = nullptr; + } +} + +template +jobject EventListenerJniCallback::SetupCallbackInvocation( + JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj, + jobject (*convert)(JNIEnv* env, const T* cpp_obj)) { + attached_thread = JNI_FALSE; + env = getJniEnv(&attached_thread); + assert(env != nullptr); + + return convert(env, &cpp_obj); +} + +void EventListenerJniCallback::CleanupCallbackInvocation( + JNIEnv* env, jboolean attached_thread, + std::initializer_list refs) { + for (auto* ref : refs) { + if (*ref == nullptr) continue; + env->DeleteLocalRef(*ref); + } + + if (env->ExceptionCheck()) { + // exception thrown from CallVoidMethod + env->ExceptionDescribe(); // print out exception to stderr + } + + releaseJniEnv(attached_thread); +} + +void EventListenerJniCallback::OnFileOperation(const jmethodID& mid, + const FileOperationInfo& info) { + if (mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jop_info = SetupCallbackInvocation( + env, attached_thread, info, + FileOperationInfoJni::fromCppFileOperationInfo); + + if (jop_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, mid, jop_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jop_info}); +} +} // namespace rocksdb diff --git a/java/rocksjni/event_listener_jnicallback.h b/java/rocksjni/event_listener_jnicallback.h new file mode 100644 index 00000000000..e3b5d0e940b --- /dev/null +++ b/java/rocksjni/event_listener_jnicallback.h @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::EventListener. 
+ +#ifndef JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ +#define JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ + +#include + +#include +#include + +#include "rocksdb/listener.h" +#include "rocksjni/jnicallback.h" + +namespace rocksdb { + +enum EnabledEventCallback { + ON_FLUSH_COMPLETED = 0x0, + ON_FLUSH_BEGIN = 0x1, + ON_TABLE_FILE_DELETED = 0x2, + ON_COMPACTION_BEGIN = 0x3, + ON_COMPACTION_COMPLETED = 0x4, + ON_TABLE_FILE_CREATED = 0x5, + ON_TABLE_FILE_CREATION_STARTED = 0x6, + ON_MEMTABLE_SEALED = 0x7, + ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED = 0x8, + ON_EXTERNAL_FILE_INGESTED = 0x9, + ON_BACKGROUND_ERROR = 0xA, + ON_STALL_CONDITIONS_CHANGED = 0xB, + ON_FILE_READ_FINISH = 0xC, + ON_FILE_WRITE_FINISH = 0xD, + ON_FILE_FLUSH_FINISH = 0xE, + ON_FILE_SYNC_FINISH = 0xF, + ON_FILE_RANGE_SYNC_FINISH = 0x10, + ON_FILE_TRUNCATE_FINISH = 0x11, + ON_FILE_CLOSE_FINISH = 0x12, + SHOULD_BE_NOTIFIED_ON_FILE_IO = 0x13, + ON_ERROR_RECOVERY_BEGIN = 0x14, + ON_ERROR_RECOVERY_COMPLETED = 0x15, + + NUM_ENABLED_EVENT_CALLBACK = 0x16, +}; + +class EventListenerJniCallback : public JniCallback, public EventListener { + public: + EventListenerJniCallback( + JNIEnv* env, jobject jevent_listener, + const std::set& enabled_event_callbacks); + virtual ~EventListenerJniCallback(); + virtual void OnFlushCompleted(DB* db, const FlushJobInfo& flush_job_info); + virtual void OnFlushBegin(DB* db, const FlushJobInfo& flush_job_info); + virtual void OnTableFileDeleted(const TableFileDeletionInfo& info); + virtual void OnCompactionBegin(DB* db, const CompactionJobInfo& ci); + virtual void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci); + virtual void OnTableFileCreated(const TableFileCreationInfo& info); + virtual void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info); + virtual void OnMemTableSealed(const MemTableInfo& info); + virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle); + virtual void OnExternalFileIngested(DB* db, + const ExternalFileIngestionInfo& info); + virtual void OnBackgroundError(BackgroundErrorReason reason, + Status* bg_error); + virtual void OnStallConditionsChanged(const WriteStallInfo& info); + virtual void OnFileReadFinish(const FileOperationInfo& info); + virtual void OnFileWriteFinish(const FileOperationInfo& info); + virtual void OnFileFlushFinish(const FileOperationInfo& info); + virtual void OnFileSyncFinish(const FileOperationInfo& info); + virtual void OnFileRangeSyncFinish(const FileOperationInfo& info); + virtual void OnFileTruncateFinish(const FileOperationInfo& info); + virtual void OnFileCloseFinish(const FileOperationInfo& info); + virtual bool ShouldBeNotifiedOnFileIO(); + virtual void OnErrorRecoveryBegin(BackgroundErrorReason reason, + Status bg_error, bool* auto_recovery); + virtual void OnErrorRecoveryCompleted(Status old_bg_error); + + private: + inline void InitCallbackMethodId(jmethodID& mid, EnabledEventCallback eec, + JNIEnv* env, + jmethodID (*get_id)(JNIEnv* env)); + template + inline jobject SetupCallbackInvocation( + JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj, + jobject (*convert)(JNIEnv* env, const T* cpp_obj)); + inline void CleanupCallbackInvocation(JNIEnv* env, jboolean attached_thread, + std::initializer_list refs); + inline void OnFileOperation(const jmethodID& mid, + const FileOperationInfo& info); + + const std::set m_enabled_event_callbacks; + jmethodID m_on_flush_completed_proxy_mid; + jmethodID m_on_flush_begin_proxy_mid; + jmethodID m_on_table_file_deleted_mid; + jmethodID 
m_on_compaction_begin_proxy_mid; + jmethodID m_on_compaction_completed_proxy_mid; + jmethodID m_on_table_file_created_mid; + jmethodID m_on_table_file_creation_started_mid; + jmethodID m_on_mem_table_sealed_mid; + jmethodID m_on_column_family_handle_deletion_started_mid; + jmethodID m_on_external_file_ingested_proxy_mid; + jmethodID m_on_background_error_proxy_mid; + jmethodID m_on_stall_conditions_changed_mid; + jmethodID m_on_file_read_finish_mid; + jmethodID m_on_file_write_finish_mid; + jmethodID m_on_file_flush_finish_mid; + jmethodID m_on_file_sync_finish_mid; + jmethodID m_on_file_range_sync_finish_mid; + jmethodID m_on_file_truncate_finish_mid; + jmethodID m_on_file_close_finish_mid; + jmethodID m_should_be_notified_on_file_io; + jmethodID m_on_error_recovery_begin_proxy_mid; + jmethodID m_on_error_recovery_completed_mid; +}; + +} // namespace rocksdb + +#endif // JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ diff --git a/java/rocksjni/jnicallback.h b/java/rocksjni/jnicallback.h index b11ef2ef2cc..5baa8973c14 100644 --- a/java/rocksjni/jnicallback.h +++ b/java/rocksjni/jnicallback.h @@ -19,6 +19,8 @@ class JniCallback { JniCallback(JNIEnv* env, jobject jcallback_obj); virtual ~JniCallback(); + const jobject& GetJavaObject() const { return m_jcallback_obj; } + protected: JavaVM* m_jvm; jobject m_jcallback_obj; diff --git a/java/rocksjni/memory_util.cc b/java/rocksjni/memory_util.cc index fac288c925c..07284d434fe 100644 --- a/java/rocksjni/memory_util.cc +++ b/java/rocksjni/memory_util.cc @@ -22,20 +22,14 @@ * Signature: ([J[J)Ljava/util/Map; */ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( - JNIEnv *env, jclass /*jclazz*/, jlongArray jdb_handles, jlongArray jcache_handles) { - std::vector dbs; - jsize db_handle_count = env->GetArrayLength(jdb_handles); - if(db_handle_count > 0) { - jlong *ptr_jdb_handles = env->GetLongArrayElements(jdb_handles, nullptr); - if (ptr_jdb_handles == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } - for (jsize i = 0; i < db_handle_count; i++) { - dbs.push_back( - reinterpret_cast(ptr_jdb_handles[i])); - } - env->ReleaseLongArrayElements(jdb_handles, ptr_jdb_handles, JNI_ABORT); + JNIEnv *env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) { + jboolean has_exception = JNI_FALSE; + std::vector dbs = + ROCKSDB_NAMESPACE::JniUtil::fromJPointers( + env, jdb_handles, &has_exception); + if (has_exception == JNI_TRUE) { + // exception thrown: OutOfMemoryError + return nullptr; } std::unordered_set cache_set; @@ -103,5 +97,4 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( } return jusage_by_type; - } diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 62ff1a3e16d..fbf3241792e 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -553,7 +553,8 @@ jlong Java_org_rocksdb_Options_dbPathsLen( void Java_org_rocksdb_Options_dbPaths( JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { - jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { // exception thrown: OutOfMemoryError return; @@ -581,7 +582,8 @@ void Java_org_rocksdb_Options_dbPaths( ptr_jtarget_size[i] = static_cast(db_path.target_size); } - env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_COMMIT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); } /* @@ -943,6 +945,7 @@ rocksdb_convert_cf_paths_from_java_helper(JNIEnv* env, jobjectArray path_array, jlong* size_array_ptr = env->GetLongArrayElements(size_array, nullptr); if (nullptr == size_array_ptr) { // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; return {}; } std::vector cf_paths; @@ -975,7 +978,7 @@ void Java_org_rocksdb_Options_setCfPaths(JNIEnv* env, jclass, jlong jhandle, jobjectArray path_array, jlongArray size_array) { auto* options = reinterpret_cast(jhandle); - jboolean has_exception; + jboolean has_exception = JNI_FALSE; std::vector cf_paths = rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array, &has_exception); @@ -1767,6 +1770,76 @@ jboolean Java_org_rocksdb_Options_strictBytesPerSync( return static_cast(opt->strict_bytes_per_sync); } +// Note: the RocksJava API currently only supports EventListeners implemented in +// Java. It could be extended in future to also support adding/removing +// EventListeners implemented in C++. +static void rocksdb_set_event_listeners_helper( + JNIEnv* env, jlongArray jlistener_array, + std::vector>& + listener_sptr_vec) { + jlong* ptr_jlistener_array = + env->GetLongArrayElements(jlistener_array, nullptr); + if (ptr_jlistener_array == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + const jsize array_size = env->GetArrayLength(jlistener_array); + listener_sptr_vec.clear(); + for (jsize i = 0; i < array_size; ++i) { + const auto& listener_sptr = + *reinterpret_cast*>( + ptr_jlistener_array[i]); + listener_sptr_vec.push_back(listener_sptr); + } +} + +/* + * Class: org_rocksdb_Options + * Method: setEventListeners + * Signature: (J[J)V + */ +void Java_org_rocksdb_Options_setEventListeners(JNIEnv* env, jclass, + jlong jhandle, + jlongArray jlistener_array) { + auto* opt = reinterpret_cast(jhandle); + rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners); +} + +// Note: the RocksJava API currently only supports EventListeners implemented in +// Java. It could be extended in future to also support adding/removing +// EventListeners implemented in C++. 
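These setter/getter helpers exchange listeners with Java as raw jlong handles: each handle is the address of a heap-allocated std::shared_ptr<EventListener> owned by the Java wrapper, which is why the setter dereferences and copies the shared_ptr into the options rather than taking ownership, and why the getter can hand the original Java objects back via GetJavaObject(). A minimal sketch of that convention (hypothetical helper names, not part of the patch):

```cpp
#include <jni.h>

#include <memory>
#include <vector>

#include "rocksdb/listener.h"

// Hypothetical helper names; only the handle convention is taken from the patch.
jlong NewListenerHandle(
    const std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>& listener) {
  // The Java wrapper stores this address as its native handle.
  return reinterpret_cast<jlong>(
      new std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>(listener));
}

void CopyListenerHandles(
    const std::vector<jlong>& handles,
    std::vector<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>>* listeners) {
  listeners->clear();
  for (const jlong h : handles) {
    // Dereference the heap-allocated shared_ptr and copy it; the handle itself
    // stays owned by the Java wrapper until that wrapper is disposed.
    listeners->push_back(
        *reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>*>(
            h));
  }
}
```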
+static jobjectArray rocksdb_get_event_listeners_helper( + JNIEnv* env, + const std::vector>& + listener_sptr_vec) { + jsize sz = static_cast(listener_sptr_vec.size()); + jclass jlistener_clazz = + ROCKSDB_NAMESPACE::AbstractEventListenerJni::getJClass(env); + jobjectArray jlisteners = env->NewObjectArray(sz, jlistener_clazz, nullptr); + if (jlisteners == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + for (jsize i = 0; i < sz; ++i) { + const auto* jni_cb = + static_cast( + listener_sptr_vec[i].get()); + env->SetObjectArrayElement(jlisteners, i, jni_cb->GetJavaObject()); + } + return jlisteners; +} + +/* + * Class: org_rocksdb_Options + * Method: eventListeners + * Signature: (J)[Lorg/rocksdb/AbstractEventListener; + */ +jobjectArray Java_org_rocksdb_Options_eventListeners(JNIEnv* env, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return rocksdb_get_event_listeners_helper(env, opt->listeners); +} + /* * Class: org_rocksdb_Options * Method: setEnableThreadTracking @@ -4070,7 +4143,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCfPaths(JNIEnv* env, jclass, jlongArray size_array) { auto* options = reinterpret_cast(jhandle); - jboolean has_exception; + jboolean has_exception = JNI_FALSE; std::vector cf_paths = rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array, &has_exception); @@ -4959,8 +5032,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplierAdditiona JNIEnv* env, jobject, jlong jhandle, jintArray jmax_bytes_for_level_multiplier_additional) { jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional); - jint* additionals = - env->GetIntArrayElements(jmax_bytes_for_level_multiplier_additional, 0); + jint* additionals = env->GetIntArrayElements( + jmax_bytes_for_level_multiplier_additional, nullptr); if (additionals == nullptr) { // exception thrown: OutOfMemoryError return; @@ -5612,7 +5685,8 @@ jlong Java_org_rocksdb_DBOptions_dbPathsLen( void Java_org_rocksdb_DBOptions_dbPaths( JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { - jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { // exception thrown: OutOfMemoryError return; @@ -5640,7 +5714,8 @@ void Java_org_rocksdb_DBOptions_dbPaths( ptr_jtarget_size[i] = static_cast(db_path.target_size); } - env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_COMMIT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); } /* @@ -6549,6 +6624,29 @@ jboolean Java_org_rocksdb_DBOptions_strictBytesPerSync( ->strict_bytes_per_sync); } +/* + * Class: org_rocksdb_DBOptions + * Method: setEventListeners + * Signature: (J[J)V + */ +void Java_org_rocksdb_DBOptions_setEventListeners(JNIEnv* env, jclass, + jlong jhandle, + jlongArray jlistener_array) { + auto* opt = reinterpret_cast(jhandle); + rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: eventListeners + * Signature: (J)[Lorg/rocksdb/AbstractEventListener; + */ +jobjectArray Java_org_rocksdb_DBOptions_eventListeners(JNIEnv* env, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return rocksdb_get_event_listeners_helper(env, opt->listeners); +} + /* * Class: org_rocksdb_DBOptions * Method: setDelayedWriteRate diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index e31da0f4fd3..3800c8324e4 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -10,14 +10,16 @@ #ifndef JAVA_ROCKSJNI_PORTAL_H_ #define JAVA_ROCKSJNI_PORTAL_H_ +#include + #include #include #include #include #include -#include #include #include +#include #include #include #include @@ -34,6 +36,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksjni/compaction_filter_factory_jnicallback.h" #include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/event_listener_jnicallback.h" #include "rocksjni/loggerjnicallback.h" #include "rocksjni/table_filter_jnicallback.h" #include "rocksjni/trace_writer_jnicallback.h" @@ -223,7 +226,7 @@ class CodeJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -261,7 +264,7 @@ class SubCodeJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -326,7 +329,7 @@ class StatusJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -347,7 +350,7 @@ class StatusJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSubCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -368,7 +371,7 @@ class StatusJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getStateMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -438,6 +441,10 @@ class StatusJni return jstatus; } + static jobject construct(JNIEnv* env, const Status* status) { + return construct(env, *status); + } + // Returns the equivalent org.rocksdb.Status.Code for the provided // C++ ROCKSDB_NAMESPACE::Status::Code enum static jbyte toJavaStatusCode(const ROCKSDB_NAMESPACE::Status::Code& code) { @@ -934,7 +941,7 @@ class RocksDBExceptionJni : * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be 
retrieved */ static jmethodID getStatusMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -1025,7 +1032,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getIteratorMethod(JNIEnv* env) { jclass jlist_clazz = getListClass(env); @@ -1046,7 +1053,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getHasNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); @@ -1066,7 +1073,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); @@ -1087,7 +1094,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getArrayListConstructorMethodId(JNIEnv* env) { jclass jarray_list_clazz = getArrayListClass(env); @@ -1107,7 +1114,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jlist_clazz = getListClass(env); @@ -1244,10 +1251,11 @@ class ByteBufferJni : public JavaClass { * Get the Java Method: ByteBuffer#allocate * * @param env A pointer to the Java environment - * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or nullptr + * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or + * nullptr * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getAllocateMethodId(JNIEnv* env, jclass jbytebuffer_clazz = nullptr) { @@ -1270,7 +1278,7 @@ class ByteBufferJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getArrayMethodId(JNIEnv* env, jclass jbytebuffer_clazz = nullptr) { @@ -1479,7 +1487,7 @@ class StringBuilderJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2366,7 +2374,7 @@ class MapJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMapPutMethodId(JNIEnv* env) { jclass jlist_clazz = getJClass(env); @@ -2898,7 +2906,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2918,7 +2926,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or 
nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2938,7 +2946,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMergeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2958,7 +2966,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMergeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2978,7 +2986,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2998,7 +3006,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3018,7 +3026,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3038,7 +3046,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSingleDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3058,7 +3066,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteRangeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3078,7 +3086,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteRangeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3098,7 +3106,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogDataMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3118,7 +3126,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3138,7 +3146,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3158,7 +3166,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or 
nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3178,7 +3186,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkNoopMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3198,7 +3206,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkRollbackMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3218,7 +3226,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkCommitMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3238,7 +3246,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getContinueMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3274,7 +3282,7 @@ class WriteBatchSavePointJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3365,7 +3373,7 @@ class HistogramDataJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3461,6 +3469,19 @@ class ColumnFamilyHandleJni : public RocksDBNativeClass { public: + static jobject fromCppColumnFamilyHandle( + JNIEnv* env, const ROCKSDB_NAMESPACE::ColumnFamilyHandle* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, reinterpret_cast(info)); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } + /** * Get the Java Class org.rocksdb.ColumnFamilyHandle * @@ -3541,7 +3562,7 @@ class AbstractCompactionFilterFactoryJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3562,7 +3583,7 @@ class AbstractCompactionFilterFactoryJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCreateCompactionFilterMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3629,7 +3650,7 @@ class AbstractComparatorJniBridge : public JavaClass { * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCompareInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID 
mid = @@ -3646,7 +3667,7 @@ class AbstractComparatorJniBridge : public JavaClass { * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFindShortestSeparatorInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3663,7 +3684,7 @@ class AbstractComparatorJniBridge : public JavaClass { * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFindShortSuccessorInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3699,7 +3720,7 @@ class AbstractComparatorJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3996,7 +4017,7 @@ class WBWIRocksIteratorJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Field ID or nullptr if the class or field id could not - * be retieved + * be retrieved */ static jfieldID getWriteEntryField(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -4317,7 +4338,7 @@ class LoggerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -4961,7 +4982,24 @@ class TickerTypeJni { return -0x14; case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL: return -0x15; - + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT: + return -0x16; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT: + return -0x17; + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT: + return -0x18; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT: + return -0x19; + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT: + return -0x1A; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT: + return -0x1B; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH: + return -0x1C; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH: + return -0x1D; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // 0x5F for backwards compatibility on current minor version. return 0x5F; @@ -5273,6 +5311,25 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC; case -0x15: return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL; + case -0x16: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT; + case -0x17: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT; + case -0x18: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT; + case -0x19: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT; + case -0x1A: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT; + case -0x1B: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT; + case -0x1C: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH; + case -0x1D: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH; case 0x5F: // 0x5F for backwards compatibility on current minor version. 
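The new ticker codes added to this mapping are negative because TickerType crosses the JNI boundary as a signed 8-bit jbyte: once the non-negative code space is used up, additional tickers are assigned negative values (-0x16 through -0x1D here), and the forward and reverse switches must be kept in sync by hand. A self-contained miniature of the pattern with a round-trip check (illustrative only, not the portal code):

```cpp
#include <cassert>
#include <cstdint>

enum class MiniTicker { BG_ERROR_COUNT, BG_IO_ERROR_COUNT };

// Forward mapping to the signed byte code sent to Java.
int8_t ToJavaByte(MiniTicker t) {
  switch (t) {
    case MiniTicker::BG_ERROR_COUNT:
      return -0x16;
    case MiniTicker::BG_IO_ERROR_COUNT:
      return -0x17;
  }
  return 0;  // unreachable
}

// Reverse mapping from the byte code back to the C++ enum.
MiniTicker ToCppTicker(int8_t b) {
  switch (b) {
    case -0x16:
      return MiniTicker::BG_ERROR_COUNT;
    case -0x17:
      return MiniTicker::BG_IO_ERROR_COUNT;
    default:
      return MiniTicker::BG_ERROR_COUNT;
  }
}

int main() {
  // If only one of the two switches were updated, this round trip would fail.
  assert(ToCppTicker(ToJavaByte(MiniTicker::BG_IO_ERROR_COUNT)) ==
         MiniTicker::BG_IO_ERROR_COUNT);
  return 0;
}
```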
return ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX; @@ -5392,6 +5449,8 @@ class HistogramTypeJni { return 0x30; case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL: return 0x31; + case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT: + return 0x31; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. return 0x1F; @@ -5506,6 +5565,9 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL; case 0x31: return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; + case 0x32: + return ROCKSDB_NAMESPACE::Histograms:: + ERROR_HANDLER_AUTORESUME_RETRY_COUNT; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; @@ -5698,7 +5760,8 @@ class TransactionJni : public JavaClass { return nullptr; } - jlong *body = env->GetLongArrayElements(jtransaction_ids, nullptr); + jboolean is_copy; + jlong* body = env->GetLongArrayElements(jtransaction_ids, &is_copy); if(body == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jkey); @@ -5708,7 +5771,8 @@ class TransactionJni : public JavaClass { for(size_t i = 0; i < len; ++i) { body[i] = static_cast(transaction_ids[i]); } - env->ReleaseLongArrayElements(jtransaction_ids, body, 0); + env->ReleaseLongArrayElements(jtransaction_ids, body, + is_copy == JNI_TRUE ? 0 : JNI_ABORT); jobject jwaiting_transactions = env->CallObjectMethod(jtransaction, mid, static_cast(column_family_id), jkey, jtransaction_ids); @@ -5979,7 +6043,7 @@ class AbstractTableFilterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFilterMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6019,7 +6083,11 @@ class TablePropertiesJni : public JavaClass { return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "", "(JJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/util/Map;Ljava/util/Map;Ljava/util/Map;)V"); + jmethodID mid = env->GetMethodID( + jclazz, "", + "(JJJJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/" + "lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;Ljava/util/Map;Ljava/util/Map;Ljava/util/Map;)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -6145,8 +6213,8 @@ class TablePropertiesJni : public JavaClass { return nullptr; } - jobject jtable_properties = env->NewObject(jclazz, mid, - static_cast(table_properties.data_size), + jobject jtable_properties = env->NewObject( + jclazz, mid, static_cast(table_properties.data_size), static_cast(table_properties.index_size), static_cast(table_properties.index_partitions), static_cast(table_properties.top_level_index_size), @@ -6165,17 +6233,14 @@ class TablePropertiesJni : public JavaClass { static_cast(table_properties.column_family_id), static_cast(table_properties.creation_time), static_cast(table_properties.oldest_key_time), - jcolumn_family_name, - jfilter_policy_name, - jcomparator_name, - jmerge_operator_name, - jprefix_extractor_name, - jproperty_collectors_names, - jcompression_name, - juser_collected_properties, - jreadable_properties, - jproperties_offsets - ); + static_cast( + table_properties.slow_compression_estimated_data_size), + static_cast( + 
table_properties.fast_compression_estimated_data_size), + jcolumn_family_name, jfilter_policy_name, jcomparator_name, + jmerge_operator_name, jprefix_extractor_name, + jproperty_collectors_names, jcompression_name, + juser_collected_properties, jreadable_properties, jproperties_offsets); if (env->ExceptionCheck()) { return nullptr; @@ -6249,7 +6314,7 @@ class ColumnFamilyDescriptorJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6269,7 +6334,7 @@ class ColumnFamilyDescriptorJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6763,7 +6828,8 @@ class ThreadStatusJni : public JavaClass { env->DeleteLocalRef(jcf_name); return nullptr; } - jlong *body = env->GetLongArrayElements(joperation_properties, nullptr); + jboolean is_copy; + jlong* body = env->GetLongArrayElements(joperation_properties, &is_copy); if (body == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jdb_name); @@ -6774,7 +6840,8 @@ class ThreadStatusJni : public JavaClass { for (size_t i = 0; i < len; ++i) { body[i] = static_cast(thread_status->op_properties[i]); } - env->ReleaseLongArrayElements(joperation_properties, body, 0); + env->ReleaseLongArrayElements(joperation_properties, body, + is_copy == JNI_TRUE ? 0 : JNI_ABORT); jobject jcfd = env->NewObject(jclazz, mid, static_cast(thread_status->thread_id), @@ -7395,7 +7462,7 @@ class AbstractTraceWriterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getWriteProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7416,7 +7483,7 @@ class AbstractTraceWriterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCloseWriterProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7437,7 +7504,7 @@ class AbstractTraceWriterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getGetFileSizeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7478,7 +7545,7 @@ class AbstractWalFilterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyLogNumberMapMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7500,7 +7567,7 @@ class AbstractWalFilterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogRecordFoundProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7521,7 +7588,7 @@ class AbstractWalFilterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass 
jclazz = getJClass(env); @@ -7659,5 +7726,760 @@ class SanityLevelJni { } } }; + +// The portal class for org.rocksdb.AbstractListener.EnabledEventCallback +class EnabledEventCallbackJni { + public: + // Returns the set of equivalent C++ + // rocksdb::EnabledEventCallbackJni::EnabledEventCallback enums for + // the provided Java jenabled_event_callback_values + static std::set toCppEnabledEventCallbacks( + jlong jenabled_event_callback_values) { + std::set enabled_event_callbacks; + for (size_t i = 0; i < EnabledEventCallback::NUM_ENABLED_EVENT_CALLBACK; + ++i) { + if (((1ULL << i) & jenabled_event_callback_values) > 0) { + enabled_event_callbacks.emplace(static_cast(i)); + } + } + return enabled_event_callbacks; + } +}; + +// The portal class for org.rocksdb.AbstractEventListener +class AbstractEventListenerJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::EventListenerJniCallback*, + AbstractEventListenerJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractEventListener + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractEventListener"); + } + + /** + * Get the Java Method: AbstractEventListener#onFlushCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushCompletedProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFlushBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushBeginProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileDeleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileDeletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileDeleted", "(Lorg/rocksdb/TableFileDeletionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onCompactionBeginProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + 
env->GetMethodID(jclazz, "onCompactionCompletedProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreated + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreatedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileCreated", "(Lorg/rocksdb/TableFileCreationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreationStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreationStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onTableFileCreationStarted", + "(Lorg/rocksdb/TableFileCreationBriefInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onMemTableSealed + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnMemTableSealedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onMemTableSealed", + "(Lorg/rocksdb/MemTableInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: + * AbstractEventListener#onColumnFamilyHandleDeletionStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnColumnFamilyHandleDeletionStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onColumnFamilyHandleDeletionStarted", + "(Lorg/rocksdb/ColumnFamilyHandle;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onExternalFileIngestedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnExternalFileIngestedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onExternalFileIngestedProxy", + "(JLorg/rocksdb/ExternalFileIngestionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onBackgroundError + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnBackgroundErrorProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onBackgroundErrorProxy", + "(BLorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onStallConditionsChanged + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnStallConditionsChangedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onStallConditionsChanged", + "(Lorg/rocksdb/WriteStallInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileReadFinish + * + * @param env A pointer to the Java 
environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileReadFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileReadFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileWriteFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileWriteFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileWriteFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileFlushFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileFlushFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileFlushFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileRangeSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileRangeSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileRangeSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileTruncateFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileTruncateFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileTruncateFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileCloseFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileCloseFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileCloseFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#shouldBeNotifiedOnFileIO + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getShouldBeNotifiedOnFileIOMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "shouldBeNotifiedOnFileIO", "()Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: 
AbstractEventListener#onErrorRecoveryBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryBeginProxy", + "(BLorg/rocksdb/Status;)Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onErrorRecoveryCompleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryCompletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryCompleted", + "(Lorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } +}; + +class FlushJobInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.FlushJobInfo object. + * + * @param env A pointer to the Java environment + * @param flush_job_info A Cpp flush job info object + * + * @return A reference to a Java org.rocksdb.FlushJobInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppFlushJobInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FlushJobInfo* flush_job_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &flush_job_info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &flush_job_info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jfile_path); + return nullptr; + } + jobject jtable_properties = TablePropertiesJni::fromCppTableProperties( + env, flush_job_info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jfile_path); + return nullptr; + } + return env->NewObject( + jclazz, ctor, static_cast(flush_job_info->cf_id), jcf_name, + jfile_path, static_cast(flush_job_info->thread_id), + static_cast(flush_job_info->job_id), + static_cast(flush_job_info->triggered_writes_slowdown), + static_cast(flush_job_info->triggered_writes_stop), + static_cast(flush_job_info->smallest_seqno), + static_cast(flush_job_info->largest_seqno), jtable_properties, + static_cast(flush_job_info->flush_reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FlushJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(JLjava/lang/String;Ljava/lang/String;JIZZJJLorg/" + "rocksdb/TableProperties;B)V"); + } +}; + +class TableFileDeletionInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.TableFileDeletionInfo object. 
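The method-ID getters above all use the same caching idiom: a function-local static resolved on first use. A sketch of the idiom (illustrative, not the patch code); it is safe because a jmethodID stays valid as long as its defining class remains loaded, and org.rocksdb.AbstractEventListener is loaded for the life of the application:

```cpp
#include <cassert>
#include <jni.h>

// Illustrative only: same idiom as the getters in AbstractEventListenerJni.
jmethodID GetOnFlushBeginProxyMethodIdSketch(JNIEnv* env, jclass jclazz) {
  // Resolved once on the first call and reused afterwards.
  static jmethodID mid = env->GetMethodID(jclazz, "onFlushBeginProxy",
                                          "(JLorg/rocksdb/FlushJobInfo;)V");
  assert(mid != nullptr);
  return mid;
}
```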
+ * + * @param env A pointer to the Java environment + * @param file_del_info A Cpp table file deletion info object + * + * @return A reference to a Java org.rocksdb.TableFileDeletionInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppTableFileDeletionInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::TableFileDeletionInfo* file_del_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &file_del_info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, file_del_info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, + JniUtil::toJavaString(env, &file_del_info->file_path), + static_cast(file_del_info->job_id), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileDeletionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;ILorg/rocksdb/Status;)V"); + } +}; + +class CompactionJobInfoJni : public JavaClass { + public: + static jobject fromCppCompactionJobInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::CompactionJobInfo* compaction_job_info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, + reinterpret_cast(compaction_job_info)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/CompactionJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } +}; + +class TableFileCreationInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jtable_properties); + return nullptr; + } + return env->NewObject(jclazz, ctor, static_cast(info->file_size), + jtable_properties, jstatus, jdb_name, jcf_name, + jfile_path, static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationInfo"); + } + 
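The getConstructorMethodId helpers in these portal classes resolve constructors under the special JNI name "&lt;init&gt;", and the descriptor string must match the Java constructor's parameter list exactly; otherwise GetMethodID returns nullptr and leaves NoSuchMethodError pending. A sketch of the lookup, using the TableFileCreationBriefInfo constructor signature that appears below (illustrative, not the patch code):

```cpp
#include <jni.h>

// Illustrative only: how a portal getConstructorMethodId helper resolves a ctor.
jmethodID GetBriefInfoConstructorSketch(JNIEnv* env) {
  jclass clazz = env->FindClass("org/rocksdb/TableFileCreationBriefInfo");
  if (clazz == nullptr) {
    return nullptr;  // exception pending: NoClassDefFoundError
  }
  // "<init>" is the JNI name for a constructor; the descriptor must match the
  // Java-side constructor parameter list exactly.
  return env->GetMethodID(
      clazz, "<init>",
      "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IB)V");
}
```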
+ static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(JLorg/rocksdb/TableProperties;Lorg/rocksdb/Status;Ljava/lang/" + "String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class TableFileCreationBriefInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationBriefInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationBriefInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, jcf_name, jfile_path, + static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationBriefInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class MemTableInfoJni : public JavaClass { + public: + static jobject fromCppMemTableInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::MemTableInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->first_seqno), + static_cast(info->earliest_seqno), + static_cast(info->num_entries), + static_cast(info->num_deletes)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/MemTableInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;JJJJ)V"); + } +}; + +class ExternalFileIngestionInfoJni : public JavaClass { + public: + static jobject fromCppExternalFileIngestionInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::ExternalFileIngestionInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jexternal_file_path = + JniUtil::toJavaString(env, &info->external_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jstring jinternal_file_path = + JniUtil::toJavaString(env, &info->internal_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + env->DeleteLocalRef(jinternal_file_path); + return nullptr; + } + return env->NewObject( + jclazz, 
ctor, jcf_name, jexternal_file_path, jinternal_file_path, + static_cast(info->global_seqno), jtable_properties); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/ExternalFileIngestionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;JLorg/rocksdb/TableProperties;)V"); + } +}; + +class WriteStallInfoJni : public JavaClass { + public: + static jobject fromCppWriteStallInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::WriteStallInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->condition.cur), + static_cast(info->condition.prev)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WriteStallInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;BB)V"); + } +}; + +class FileOperationInfoJni : public JavaClass { + public: + static jobject fromCppFileOperationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FileOperationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jpath = JniUtil::toJavaString(env, &info->path); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jpath); + return nullptr; + } + return env->NewObject( + jclazz, ctor, jpath, static_cast(info->offset), + static_cast(info->length), + static_cast(info->start_ts.time_since_epoch().count()), + static_cast(info->duration.count()), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FileOperationInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;JJJJLorg/rocksdb/Status;)V"); + } +}; } // namespace ROCKSDB_NAMESPACE #endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 0f463e9b3d5..054c0fc89fd 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -421,8 +421,8 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( std::vector cf_descriptors; cf_descriptors.reserve(jlen); - jboolean jcf_options_handles_is_copy = JNI_FALSE; - jlong *jcf_options_handles_elems = env->GetLongArrayElements(jcf_options_handles, &jcf_options_handles_is_copy); + jlong* jcf_options_handles_elems = + env->GetLongArrayElements(jcf_options_handles, nullptr); if(jcf_options_handles_elems == nullptr) { // exception thrown: OutOfMemoryError return nullptr; @@ -2343,9 +2343,7 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateSizes( const jsize jlen = env->GetArrayLength(jrange_slice_handles); const size_t range_count = jlen / 2; - jboolean jranges_is_copy = JNI_FALSE; - jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, - &jranges_is_copy); + jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, nullptr); if (jranges == nullptr) { // exception thrown: OutOfMemoryError return 
nullptr; @@ -2430,14 +2428,13 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateMemTableStats( static_cast(count), static_cast(sizes)}; - const jsize jcount = static_cast(count); - jlongArray jsizes = env->NewLongArray(jcount); + jlongArray jsizes = env->NewLongArray(2); if (jsizes == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } - env->SetLongArrayRegion(jsizes, 0, jcount, results); + env->SetLongArrayRegion(jsizes, 0, 2, results); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jsizes); @@ -3256,9 +3253,8 @@ jobject Java_org_rocksdb_RocksDB_getPropertiesOfTablesInRange( reinterpret_cast(jcf_handle); } const jsize jlen = env->GetArrayLength(jrange_slice_handles); - jboolean jrange_slice_handles_is_copy = JNI_FALSE; - jlong *jrange_slice_handle = env->GetLongArrayElements( - jrange_slice_handles, &jrange_slice_handles_is_copy); + jlong* jrange_slice_handle = + env->GetLongArrayElements(jrange_slice_handles, nullptr); if (jrange_slice_handle == nullptr) { // exception occurred return nullptr; diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index d9e58992bd6..00ee8e7e22e 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -228,6 +228,17 @@ void Java_org_rocksdb_Slice_removePrefix0(JNIEnv* /*env*/, jobject /*jobj*/, slice->remove_prefix(length); } +/* + * Class: org_rocksdb_DirectSlice + * Method: setLength0 + * Signature: (JI)V + */ +void Java_org_rocksdb_DirectSlice_setLength0(JNIEnv* /*env*/, jobject /*jobj*/, + jlong handle, jint length) { + auto* slice = reinterpret_cast(handle); + slice->size_ = length; +} + /* * Class: org_rocksdb_Slice * Method: disposeInternalBuf diff --git a/java/rocksjni/testable_event_listener.cc b/java/rocksjni/testable_event_listener.cc new file mode 100644 index 00000000000..2540f2ecb69 --- /dev/null +++ b/java/rocksjni/testable_event_listener.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
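The new TestableEventListener JNI file below fills its synthetic structs with UINT64_MAX (and INT_MAX) sentinels. Because Java's long is signed, a UINT64_MAX field shows up on the Java side as -1 after the static_cast to jlong, yet the bit pattern round-trips losslessly, which is what makes it a convenient marker for the Java test to assert on. A minimal sketch of that conversion (assuming a two's-complement 64-bit jlong, as on all supported platforms):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t sentinel = UINT64_MAX;
  // jlong is a signed 64-bit type; this is the value the Java side observes.
  const int64_t as_jlong = static_cast<int64_t>(sentinel);
  assert(as_jlong == -1);
  // The bit pattern is preserved, so the value round-trips losslessly.
  assert(static_cast<uint64_t>(as_jlong) == UINT64_MAX);
  return 0;
}
```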
+#include +#include +#include + +#include "include/org_rocksdb_test_TestableEventListener.h" +#include "rocksdb/listener.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" + +using namespace ROCKSDB_NAMESPACE; + +static TableProperties newTablePropertiesForTest() { + TableProperties table_properties; + table_properties.data_size = UINT64_MAX; + table_properties.index_size = UINT64_MAX; + table_properties.index_partitions = UINT64_MAX; + table_properties.top_level_index_size = UINT64_MAX; + table_properties.index_key_is_user_key = UINT64_MAX; + table_properties.index_value_is_delta_encoded = UINT64_MAX; + table_properties.filter_size = UINT64_MAX; + table_properties.raw_key_size = UINT64_MAX; + table_properties.raw_value_size = UINT64_MAX; + table_properties.num_data_blocks = UINT64_MAX; + table_properties.num_entries = UINT64_MAX; + table_properties.num_deletions = UINT64_MAX; + table_properties.num_merge_operands = UINT64_MAX; + table_properties.num_range_deletions = UINT64_MAX; + table_properties.format_version = UINT64_MAX; + table_properties.fixed_key_len = UINT64_MAX; + table_properties.column_family_id = UINT64_MAX; + table_properties.creation_time = UINT64_MAX; + table_properties.oldest_key_time = UINT64_MAX; + table_properties.file_creation_time = UINT64_MAX; + table_properties.slow_compression_estimated_data_size = UINT64_MAX; + table_properties.fast_compression_estimated_data_size = UINT64_MAX; + table_properties.db_id = "dbId"; + table_properties.db_session_id = "sessionId"; + table_properties.column_family_name = "columnFamilyName"; + table_properties.filter_policy_name = "filterPolicyName"; + table_properties.comparator_name = "comparatorName"; + table_properties.merge_operator_name = "mergeOperatorName"; + table_properties.prefix_extractor_name = "prefixExtractorName"; + table_properties.property_collectors_names = "propertyCollectorsNames"; + table_properties.compression_name = "compressionName"; + table_properties.compression_options = "compressionOptions"; + table_properties.user_collected_properties = {{"key", "value"}}; + table_properties.readable_properties = {{"key", "value"}}; + table_properties.properties_offsets = {{"key", UINT64_MAX}}; + return table_properties; +} + +/* + * Class: org_rocksdb_test_TestableEventListener + * Method: invokeAllCallbacks + * Signature: (J)V + */ +void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks( + JNIEnv *, jclass, jlong jhandle) { + const auto &el = + *reinterpret_cast *>( + jhandle); + + TableProperties table_properties = newTablePropertiesForTest(); + + FlushJobInfo flush_job_info; + flush_job_info.cf_id = INT_MAX; + flush_job_info.cf_name = "testColumnFamily"; + flush_job_info.file_path = "/file/path"; + flush_job_info.file_number = UINT64_MAX; + flush_job_info.oldest_blob_file_number = UINT64_MAX; + flush_job_info.thread_id = UINT64_MAX; + flush_job_info.job_id = INT_MAX; + flush_job_info.triggered_writes_slowdown = true; + flush_job_info.triggered_writes_stop = true; + flush_job_info.smallest_seqno = UINT64_MAX; + flush_job_info.largest_seqno = UINT64_MAX; + flush_job_info.table_properties = table_properties; + flush_job_info.flush_reason = FlushReason::kManualFlush; + + el->OnFlushCompleted(nullptr, flush_job_info); + el->OnFlushBegin(nullptr, flush_job_info); + + Status status = Status::Incomplete(Status::SubCode::kNoSpace); + + TableFileDeletionInfo file_deletion_info; + file_deletion_info.db_name = "dbName"; + file_deletion_info.file_path = "/file/path"; + file_deletion_info.job_id = 
INT_MAX; + file_deletion_info.status = status; + + el->OnTableFileDeleted(file_deletion_info); + + CompactionJobInfo compaction_job_info; + compaction_job_info.cf_id = UINT32_MAX; + compaction_job_info.cf_name = "compactionColumnFamily"; + compaction_job_info.status = status; + compaction_job_info.thread_id = UINT64_MAX; + compaction_job_info.job_id = INT_MAX; + compaction_job_info.base_input_level = INT_MAX; + compaction_job_info.output_level = INT_MAX; + compaction_job_info.input_files = {"inputFile.sst"}; + compaction_job_info.input_file_infos = {}; + compaction_job_info.output_files = {"outputFile.sst"}; + compaction_job_info.output_file_infos = {}; + compaction_job_info.table_properties = { + {"tableProperties", std::shared_ptr( + &table_properties, [](TableProperties *) {})}}; + compaction_job_info.compaction_reason = CompactionReason::kFlush; + compaction_job_info.compression = CompressionType::kSnappyCompression; + + compaction_job_info.stats = CompactionJobStats(); + + el->OnCompactionBegin(nullptr, compaction_job_info); + el->OnCompactionCompleted(nullptr, compaction_job_info); + + TableFileCreationInfo file_creation_info; + file_creation_info.file_size = UINT64_MAX; + file_creation_info.table_properties = table_properties; + file_creation_info.status = status; + file_creation_info.file_checksum = "fileChecksum"; + file_creation_info.file_checksum_func_name = "fileChecksumFuncName"; + file_creation_info.db_name = "dbName"; + file_creation_info.cf_name = "columnFamilyName"; + file_creation_info.file_path = "/file/path"; + file_creation_info.job_id = INT_MAX; + file_creation_info.reason = TableFileCreationReason::kMisc; + + el->OnTableFileCreated(file_creation_info); + + TableFileCreationBriefInfo file_creation_brief_info; + file_creation_brief_info.db_name = "dbName"; + file_creation_brief_info.cf_name = "columnFamilyName"; + file_creation_brief_info.file_path = "/file/path"; + file_creation_brief_info.job_id = INT_MAX; + file_creation_brief_info.reason = TableFileCreationReason::kMisc; + + el->OnTableFileCreationStarted(file_creation_brief_info); + + MemTableInfo mem_table_info; + mem_table_info.cf_name = "columnFamilyName"; + mem_table_info.first_seqno = UINT64_MAX; + mem_table_info.earliest_seqno = UINT64_MAX; + mem_table_info.num_entries = UINT64_MAX; + mem_table_info.num_deletes = UINT64_MAX; + + el->OnMemTableSealed(mem_table_info); + el->OnColumnFamilyHandleDeletionStarted(nullptr); + + ExternalFileIngestionInfo file_ingestion_info; + file_ingestion_info.cf_name = "columnFamilyName"; + file_ingestion_info.external_file_path = "/external/file/path"; + file_ingestion_info.internal_file_path = "/internal/file/path"; + file_ingestion_info.global_seqno = UINT64_MAX; + file_ingestion_info.table_properties = table_properties; + el->OnExternalFileIngested(nullptr, file_ingestion_info); + + el->OnBackgroundError(BackgroundErrorReason::kFlush, &status); + + WriteStallInfo write_stall_info; + write_stall_info.cf_name = "columnFamilyName"; + write_stall_info.condition.cur = WriteStallCondition::kDelayed; + write_stall_info.condition.prev = WriteStallCondition::kStopped; + el->OnStallConditionsChanged(write_stall_info); + + FileOperationInfo op_info = FileOperationInfo( + FileOperationType::kRead, "/file/path", + std::make_pair(std::chrono::time_point( + std::chrono::nanoseconds(1600699420000000000ll)), + std::chrono::time_point( + std::chrono::nanoseconds(1600699420000000000ll))), + std::chrono::time_point( + std::chrono::nanoseconds(1600699425000000000ll)), + status); + op_info.offset 
= UINT64_MAX; + op_info.length = SIZE_MAX; + op_info.status = status; + + el->OnFileReadFinish(op_info); + el->OnFileWriteFinish(op_info); + el->OnFileFlushFinish(op_info); + el->OnFileSyncFinish(op_info); + el->OnFileRangeSyncFinish(op_info); + el->OnFileTruncateFinish(op_info); + el->OnFileCloseFinish(op_info); + el->ShouldBeNotifiedOnFileIO(); + + bool auto_recovery; + el->OnErrorRecoveryBegin(BackgroundErrorReason::kFlush, status, + &auto_recovery); + el->OnErrorRecoveryCompleted(status); +} diff --git a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc index 3b9123ed13e..720285e7555 100644 --- a/java/rocksjni/transaction.cc +++ b/java/rocksjni/transaction.cc @@ -689,6 +689,7 @@ void txn_write_kv_parts_helper(JNIEnv* env, // out of memory env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); free_parts(env, jparts_to_free); return; } @@ -698,6 +699,7 @@ void txn_write_kv_parts_helper(JNIEnv* env, env->ReleaseByteArrayElements(jba_value_part, jvalue_part, JNI_ABORT); env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); free_parts(env, jparts_to_free); return; } diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index 77d17c82a3a..a898bffb80e 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -197,7 +197,7 @@ jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( *cfOptions, std::string(reinterpret_cast(cfname), len), &handle, jttl); - env->ReleaseByteArrayElements(jcolumn_name, cfname, 0); + env->ReleaseByteArrayElements(jcolumn_name, cfname, JNI_ABORT); if (s.ok()) { return reinterpret_cast(handle); diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index c517afcc129..dd6e0b36c7a 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -47,7 +47,7 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, ROCKSDB_NAMESPACE::WriteBufferManager wb(options.db_write_buffer_size); options.memtable_factory = factory; ROCKSDB_NAMESPACE::MemTable* mem = new ROCKSDB_NAMESPACE::MemTable( - cmp, ROCKSDB_NAMESPACE::ImmutableCFOptions(options), + cmp, ROCKSDB_NAMESPACE::ImmutableOptions(options), ROCKSDB_NAMESPACE::MutableCFOptions(options), &wb, ROCKSDB_NAMESPACE::kMaxSequenceNumber, 0 /* column_family_id */); mem->Ref(); @@ -63,10 +63,10 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ROCKSDB_NAMESPACE::ParsedInternalKey ikey; ikey.clear(); - ROCKSDB_NAMESPACE::Status pikStatus = - ROCKSDB_NAMESPACE::ParseInternalKey(iter->key(), &ikey); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); + ROCKSDB_NAMESPACE::Status pik_status = ROCKSDB_NAMESPACE::ParseInternalKey( + iter->key(), &ikey, true /* log_err_key */); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); switch (ikey.type) { case ROCKSDB_NAMESPACE::kTypeValue: state.append("Put("); diff --git a/java/samples/src/main/java/OptimisticTransactionSample.java b/java/samples/src/main/java/OptimisticTransactionSample.java index 1633d1f2bd4..7e7a22e9485 100644 --- a/java/samples/src/main/java/OptimisticTransactionSample.java +++ b/java/samples/src/main/java/OptimisticTransactionSample.java @@ -111,7 +111,7 @@ private static void repeatableRead(final OptimisticTransactionDB txnDb, // Read a key using the snapshot. 
readOptions.setSnapshot(snapshot); final byte[] value = txn.getForUpdate(readOptions, key1, true); - assert(value == value1); + assert (value == null); try { // Attempt to commit transaction diff --git a/java/samples/src/main/java/RocksDBColumnFamilySample.java b/java/samples/src/main/java/RocksDBColumnFamilySample.java index 650b1b2f600..72f5731a1bd 100644 --- a/java/samples/src/main/java/RocksDBColumnFamilySample.java +++ b/java/samples/src/main/java/RocksDBColumnFamilySample.java @@ -53,8 +53,8 @@ public static void main(final String[] args) throws RocksDBException { try { // put and get from non-default column family - db.put(columnFamilyHandles.get(0), new WriteOptions(), - "key".getBytes(), "value".getBytes()); + db.put( + columnFamilyHandles.get(1), new WriteOptions(), "key".getBytes(), "value".getBytes()); // atomic write try (final WriteBatch wb = new WriteBatch()) { @@ -62,7 +62,7 @@ public static void main(final String[] args) throws RocksDBException { "value2".getBytes()); wb.put(columnFamilyHandles.get(1), "key3".getBytes(), "value3".getBytes()); - wb.remove(columnFamilyHandles.get(0), "key".getBytes()); + wb.delete(columnFamilyHandles.get(1), "key".getBytes()); db.write(new WriteOptions(), wb); } diff --git a/java/samples/src/main/java/RocksDBSample.java b/java/samples/src/main/java/RocksDBSample.java index f61995ed98f..0637c11148b 100644 --- a/java/samples/src/main/java/RocksDBSample.java +++ b/java/samples/src/main/java/RocksDBSample.java @@ -45,7 +45,7 @@ public static void main(final String[] args) { .setStatistics(stats) .setWriteBufferSize(8 * SizeUnit.KB) .setMaxWriteBufferNumber(3) - .setMaxBackgroundCompactions(10) + .setMaxBackgroundJobs(10) .setCompressionType(CompressionType.SNAPPY_COMPRESSION) .setCompactionStyle(CompactionStyle.UNIVERSAL); } catch (final IllegalArgumentException e) { @@ -55,7 +55,7 @@ public static void main(final String[] args) { assert (options.createIfMissing() == true); assert (options.writeBufferSize() == 8 * SizeUnit.KB); assert (options.maxWriteBufferNumber() == 3); - assert (options.maxBackgroundCompactions() == 10); + assert (options.maxBackgroundJobs() == 10); assert (options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert (options.compactionStyle() == CompactionStyle.UNIVERSAL); @@ -87,24 +87,17 @@ public static void main(final String[] args) { options.setRateLimiter(rateLimiter); final BlockBasedTableConfig table_options = new BlockBasedTableConfig(); - table_options.setBlockCacheSize(64 * SizeUnit.KB) - .setFilter(bloomFilter) - .setCacheNumShardBits(6) + Cache cache = new LRUCache(64 * 1024, 6); + table_options.setBlockCache(cache) + .setFilterPolicy(bloomFilter) .setBlockSizeDeviation(5) .setBlockRestartInterval(10) .setCacheIndexAndFilterBlocks(true) - .setHashIndexAllowCollision(false) - .setBlockCacheCompressedSize(64 * SizeUnit.KB) - .setBlockCacheCompressedNumShardBits(10); + .setBlockCacheCompressed(new LRUCache(64 * 1000, 10)); - assert (table_options.blockCacheSize() == 64 * SizeUnit.KB); - assert (table_options.cacheNumShardBits() == 6); assert (table_options.blockSizeDeviation() == 5); assert (table_options.blockRestartInterval() == 10); assert (table_options.cacheIndexAndFilterBlocks() == true); - assert (table_options.hashIndexAllowCollision() == false); - assert (table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB); - assert (table_options.blockCacheCompressedNumShardBits() == 10); options.setTableFormatConfig(table_options); assert (options.tableFactoryName().equals("BlockBasedTable")); @@ 
-203,14 +196,14 @@ public static void main(final String[] args) { len = db.get(readOptions, testKey, enoughArray); assert (len == testValue.length); - db.remove(testKey); + db.delete(testKey); len = db.get(testKey, enoughArray); assert (len == RocksDB.NOT_FOUND); // repeat the test with WriteOptions try (final WriteOptions writeOpts = new WriteOptions()) { writeOpts.setSync(true); - writeOpts.setDisableWAL(true); + writeOpts.setDisableWAL(false); db.put(writeOpts, testKey, testValue); len = db.get(testKey, enoughArray); assert (len == testValue.length); @@ -284,15 +277,15 @@ public static void main(final String[] args) { } } - Map values = db.multiGet(keys); + List values = db.multiGetAsList(keys); assert (values.size() == keys.size()); - for (final byte[] value1 : values.values()) { + for (final byte[] value1 : values) { assert (value1 != null); } - values = db.multiGet(new ReadOptions(), keys); + values = db.multiGetAsList(new ReadOptions(), keys); assert (values.size() == keys.size()); - for (final byte[] value1 : values.values()) { + for (final byte[] value1 : values) { assert (value1 != null); } } catch (final RocksDBException e) { diff --git a/java/src/main/java/org/rocksdb/AbstractEventListener.java b/java/src/main/java/org/rocksdb/AbstractEventListener.java new file mode 100644 index 00000000000..6698acf88f2 --- /dev/null +++ b/java/src/main/java/org/rocksdb/AbstractEventListener.java @@ -0,0 +1,334 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.rocksdb.AbstractEventListener.EnabledEventCallback.*; + +/** + * Base class for Event Listeners. + */ +public abstract class AbstractEventListener extends RocksCallbackObject implements EventListener { + public enum EnabledEventCallback { + ON_FLUSH_COMPLETED((byte) 0x0), + ON_FLUSH_BEGIN((byte) 0x1), + ON_TABLE_FILE_DELETED((byte) 0x2), + ON_COMPACTION_BEGIN((byte) 0x3), + ON_COMPACTION_COMPLETED((byte) 0x4), + ON_TABLE_FILE_CREATED((byte) 0x5), + ON_TABLE_FILE_CREATION_STARTED((byte) 0x6), + ON_MEMTABLE_SEALED((byte) 0x7), + ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED((byte) 0x8), + ON_EXTERNAL_FILE_INGESTED((byte) 0x9), + ON_BACKGROUND_ERROR((byte) 0xA), + ON_STALL_CONDITIONS_CHANGED((byte) 0xB), + ON_FILE_READ_FINISH((byte) 0xC), + ON_FILE_WRITE_FINISH((byte) 0xD), + ON_FILE_FLUSH_FINISH((byte) 0xE), + ON_FILE_SYNC_FINISH((byte) 0xF), + ON_FILE_RANGE_SYNC_FINISH((byte) 0x10), + ON_FILE_TRUNCATE_FINISH((byte) 0x11), + ON_FILE_CLOSE_FINISH((byte) 0x12), + SHOULD_BE_NOTIFIED_ON_FILE_IO((byte) 0x13), + ON_ERROR_RECOVERY_BEGIN((byte) 0x14), + ON_ERROR_RECOVERY_COMPLETED((byte) 0x15); + + private final byte value; + + EnabledEventCallback(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value + */ + byte getValue() { + return value; + } + + /** + * Get the EnabledEventCallbacks from the internal representation value. + * + * @return the enabled event callback. + * + * @throws IllegalArgumentException if the value is unknown. 
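+   * @param value the internal representation value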
+ */ + static EnabledEventCallback fromValue(final byte value) { + for (final EnabledEventCallback enabledEventCallback : EnabledEventCallback.values()) { + if (enabledEventCallback.value == value) { + return enabledEventCallback; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for EnabledEventCallback: " + value); + } + } + + /** + * Creates an Event Listener that will + * received all callbacks from C++. + * + * If you don't need all callbacks, it is much more efficient to + * just register for the ones you need by calling + * {@link #AbstractEventListener(EnabledEventCallback...)} instead. + */ + protected AbstractEventListener() { + this(ON_FLUSH_COMPLETED, ON_FLUSH_BEGIN, ON_TABLE_FILE_DELETED, ON_COMPACTION_BEGIN, + ON_COMPACTION_COMPLETED, ON_TABLE_FILE_CREATED, ON_TABLE_FILE_CREATION_STARTED, + ON_MEMTABLE_SEALED, ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, ON_EXTERNAL_FILE_INGESTED, + ON_BACKGROUND_ERROR, ON_STALL_CONDITIONS_CHANGED, ON_FILE_READ_FINISH, ON_FILE_WRITE_FINISH, + ON_FILE_FLUSH_FINISH, ON_FILE_SYNC_FINISH, ON_FILE_RANGE_SYNC_FINISH, + ON_FILE_TRUNCATE_FINISH, ON_FILE_CLOSE_FINISH, SHOULD_BE_NOTIFIED_ON_FILE_IO, + ON_ERROR_RECOVERY_BEGIN, ON_ERROR_RECOVERY_COMPLETED); + } + + /** + * Creates an Event Listener that will + * receive only certain callbacks from C++. + * + * @param enabledEventCallbacks callbacks to enable in Java. + */ + protected AbstractEventListener(final EnabledEventCallback... enabledEventCallbacks) { + super(packToLong(enabledEventCallbacks)); + } + + /** + * Pack EnabledEventCallbacks to a long. + * + * @param enabledEventCallbacks the flags + * + * @return a long + */ + private static long packToLong(final EnabledEventCallback... enabledEventCallbacks) { + long l = 0; + for (int i = 0; i < enabledEventCallbacks.length; i++) { + l |= 1 << enabledEventCallbacks[i].getValue(); + } + return l; + } + + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. + * + * @param dbHandle native handle of the database + * @param flushJobInfo the flush job info + */ + private void onFlushCompletedProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onFlushCompleted(db, flushJobInfo); + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onFlushBegin(RocksDB, FlushJobInfo)}. + * + * @param dbHandle native handle of the database + * @param flushJobInfo the flush job info + */ + private void onFlushBeginProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onFlushBegin(db, flushJobInfo); + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + // no-op + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onCompactionBegin(RocksDB, CompactionJobInfo)}. 
+ * + * @param dbHandle native handle of the database + * @param compactionJobInfo the flush job info + */ + private void onCompactionBeginProxy( + final long dbHandle, final CompactionJobInfo compactionJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onCompactionBegin(db, compactionJobInfo); + } + + @Override + public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. + * + * @param dbHandle native handle of the database + * @param compactionJobInfo the flush job info + */ + private void onCompactionCompletedProxy( + final long dbHandle, final CompactionJobInfo compactionJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onCompactionCompleted(db, compactionJobInfo); + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + // no-op + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + // no-op + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + // no-op + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + // no-op + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onExternalFileIngested(RocksDB, ExternalFileIngestionInfo)}. + * + * @param dbHandle native handle of the database + * @param externalFileIngestionInfo the flush job info + */ + private void onExternalFileIngestedProxy( + final long dbHandle, final ExternalFileIngestionInfo externalFileIngestionInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onExternalFileIngested(db, externalFileIngestionInfo); + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onBackgroundError(BackgroundErrorReason, Status)}. 
+ * + * @param reasonByte byte value representing error reason + * @param backgroundError status with error code + */ + private void onBackgroundErrorProxy(final byte reasonByte, final Status backgroundError) { + onBackgroundError(BackgroundErrorReason.fromValue(reasonByte), backgroundError); + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + // no-op + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + return true; + } + + /** + * Called from JNI, proxy for + * {@link #onErrorRecoveryBegin(BackgroundErrorReason, Status)}. + * + * @param reasonByte byte value representing error reason + * @param backgroundError status with error code + */ + private boolean onErrorRecoveryBeginProxy(final byte reasonByte, final Status backgroundError) { + return onErrorRecoveryBegin(BackgroundErrorReason.fromValue(reasonByte), backgroundError); + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + // no-op + } + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewEventListener(nativeParameterHandles[0]); + } + + /** + * Deletes underlying C++ native callback object pointer + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native long createNewEventListener(final long enabledEventCallbackValues); + private native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java index 772a5900b5b..76d9bde4646 100644 --- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java @@ -301,7 +301,7 @@ T setCompressionPerLevel( * @return the reference to the current options. */ @Experimental("Turning this feature on or off for an existing DB can cause" + - "unexpected LSM tree structure so it's not recommended") + " unexpected LSM tree structure so it's not recommended") T setLevelCompactionDynamicLevelBytes( boolean enableLevelCompactionDynamicLevelBytes); diff --git a/java/src/main/java/org/rocksdb/BackgroundErrorReason.java b/java/src/main/java/org/rocksdb/BackgroundErrorReason.java new file mode 100644 index 00000000000..eec593d35c5 --- /dev/null +++ b/java/src/main/java/org/rocksdb/BackgroundErrorReason.java @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum BackgroundErrorReason { + FLUSH((byte) 0x0), + COMPACTION((byte) 0x1), + WRITE_CALLBACK((byte) 0x2), + MEMTABLE((byte) 0x3); + + private final byte value; + + BackgroundErrorReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the BackgroundErrorReason from the internal representation value. + * + * @return the background error reason. + * + * @throws IllegalArgumentException if the value is unknown. + */ + static BackgroundErrorReason fromValue(final byte value) { + for (final BackgroundErrorReason backgroundErrorReason : BackgroundErrorReason.values()) { + if (backgroundErrorReason.value == value) { + return backgroundErrorReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for BackgroundErrorReason: " + value); + } +} diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 6730e645230..a8f436e2f0a 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -38,7 +38,7 @@ public BlockBasedTableConfig() { wholeKeyFiltering = true; verifyCompression = false; readAmpBytesPerBit = 0; - formatVersion = 4; + formatVersion = 5; enableIndexCompression = true; blockAlign = false; indexShortening = IndexShorteningMode.kShortenSeparators; diff --git a/java/src/main/java/org/rocksdb/Cache.java b/java/src/main/java/org/rocksdb/Cache.java index 3952e1d109c..569a1df06cf 100644 --- a/java/src/main/java/org/rocksdb/Cache.java +++ b/java/src/main/java/org/rocksdb/Cache.java @@ -10,4 +10,31 @@ public abstract class Cache extends RocksObject { protected Cache(final long nativeHandle) { super(nativeHandle); } + + /** + * Returns the memory size for the entries + * residing in cache. + * + * @return cache usage size. + * + */ + public long getUsage() { + assert (isOwningHandle()); + return getUsage(this.nativeHandle_); + } + + /** + * Returns the memory size for the entries + * being pinned in cache. + * + * @return cache pinned usage size. + * + */ + public long getPinnedUsage() { + assert (isOwningHandle()); + return getPinnedUsage(this.nativeHandle_); + } + + private native static long getUsage(final long handle); + private native static long getPinnedUsage(final long handle); } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java index 3a2e97efb57..1ac0a35bbd7 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -13,6 +13,12 @@ * ColumnFamily Pointers. */ public class ColumnFamilyHandle extends RocksObject { + /** + * Constructs column family Java object, which operates on underlying native object. + * + * @param rocksDB db instance associated with this column family + * @param nativeHandle native handle to underlying native ColumnFamily object + */ ColumnFamilyHandle(final RocksDB rocksDB, final long nativeHandle) { super(nativeHandle); @@ -24,6 +30,28 @@ public class ColumnFamilyHandle extends RocksObject { this.rocksDB_ = rocksDB; } + /** + * Constructor called only from JNI. 
+ * + * NOTE: we are producing an additional Java Object here to represent the underlying native C++ + * ColumnFamilyHandle object. The underlying object is not owned by ourselves. The Java API user + * likely already had a ColumnFamilyHandle Java object which owns the underlying C++ object, as + * they will have been presented it when they opened the database or added a Column Family. + * + * + * TODO(AR) - Potentially a better design would be to cache the active Java Column Family Objects + * in RocksDB, and return the same Java Object instead of instantiating a new one here. This could + * also help us to improve the Java API semantics for Java users. See for example + * https://github.com/facebook/rocksdb/issues/2687. + * + * @param nativeHandle native handle to the column family. + */ + ColumnFamilyHandle(final long nativeHandle) { + super(nativeHandle); + rocksDB_ = null; + disOwnNativeHandle(); + } + /** * Gets the name of the Column Family. * @@ -87,7 +115,9 @@ public boolean equals(final Object o) { @Override public int hashCode() { try { - return Objects.hash(getName(), getID(), rocksDB_.nativeHandle_); + int result = Objects.hash(getID(), rocksDB_.nativeHandle_); + result = 31 * result + Arrays.hashCode(getName()); + return result; } catch (RocksDBException e) { throw new RuntimeException("Cannot calculate hash code of column family handle", e); } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index 825c34973c5..72149bf2669 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -339,7 +339,7 @@ public List compressionPerLevel() { final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); - for (final Byte byteCompressionType : byteCompressionTypes) { + for (final byte byteCompressionType : byteCompressionTypes) { compressionLevels.add(CompressionType.getCompressionType( byteCompressionType)); } diff --git a/java/src/main/java/org/rocksdb/CompactionJobInfo.java b/java/src/main/java/org/rocksdb/CompactionJobInfo.java index 8b59edc91db..4e3b8d68b82 100644 --- a/java/src/main/java/org/rocksdb/CompactionJobInfo.java +++ b/java/src/main/java/org/rocksdb/CompactionJobInfo.java @@ -20,6 +20,8 @@ public CompactionJobInfo() { */ private CompactionJobInfo(final long nativeHandle) { super(nativeHandle); + // We do not own the native object! 
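+    // The native struct is owned by the C++ side and is released once the
+    // listener callback that produced it returns, so it must never be freed
+    // from Java.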
+ disOwnNativeHandle(); } /** diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index a3eef513e31..2930a92728b 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -884,32 +884,18 @@ public boolean strictBytesPerSync() { return strictBytesPerSync(nativeHandle_); } - //TODO(AR) NOW -// @Override -// public DBOptions setListeners(final List listeners) { -// assert(isOwningHandle()); -// final long[] eventListenerHandlers = new long[listeners.size()]; -// for (int i = 0; i < eventListenerHandlers.length; i++) { -// eventListenerHandlers[i] = listeners.get(i).nativeHandle_; -// } -// setEventListeners(nativeHandle_, eventListenerHandlers); -// return this; -// } -// -// @Override -// public Collection listeners() { -// assert(isOwningHandle()); -// final long[] eventListenerHandlers = listeners(nativeHandle_); -// if (eventListenerHandlers == null || eventListenerHandlers.length == 0) { -// return Collections.emptyList(); -// } -// -// final List eventListeners = new ArrayList<>(); -// for (final long eventListenerHandle : eventListenerHandlers) { -// eventListeners.add(new EventListener(eventListenerHandle)); //TODO(AR) check ownership is set to false! -// } -// return eventListeners; -// } + @Override + public DBOptions setListeners(final List listeners) { + assert (isOwningHandle()); + setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners)); + return this; + } + + @Override + public List listeners() { + assert (isOwningHandle()); + return Arrays.asList(eventListeners(nativeHandle_)); + } @Override public DBOptions setEnableThreadTracking(final boolean enableThreadTracking) { @@ -1459,6 +1445,9 @@ private native void setStrictBytesPerSync( final long handle, final boolean strictBytesPerSync); private native boolean strictBytesPerSync( final long handle); + private static native void setEventListeners( + final long handle, final long[] eventListenerHandles); + private static native AbstractEventListener[] eventListeners(final long handle); private native void setEnableThreadTracking(long handle, boolean enableThreadTracking); private native boolean enableThreadTracking(long handle); diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index 72be7d0e647..6609e0ad307 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -625,7 +625,7 @@ public interface DBOptionsInterface> { * then WAL_size_limit_MB, they will be deleted starting with the * earliest until size_limit is met. All empty files will be deleted. *
   *    <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-  *       WAL files will be checked every WAL_ttl_secondsi / 2 and those that
+  *       WAL files will be checked every WAL_ttl_seconds / 2 and those that
   *       are older than WAL_ttl_seconds will be deleted.</li>
   *    <li>If both are not 0, WAL files will be checked every 10 min and both
   *       checks will be performed with ttl being first.</li>
@@ -648,7 +648,7 @@ public interface DBOptionsInterface> {
   *    then WAL_size_limit_MB, they will be deleted starting with the
   *    earliest until size_limit is met. All empty files will be deleted.
   *    <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-  *       WAL files will be checked every WAL_ttl_secondsi / 2 and those that
+  *       WAL files will be checked every WAL_ttl_seconds / 2 and those that
   *       are older than WAL_ttl_seconds will be deleted.</li>
   *    <li>If both are not 0, WAL files will be checked every 10 min and both
   *       checks will be performed with ttl being first.</li>
  • @@ -1055,24 +1055,31 @@ T setNewTableReaderForCompactionInputs( */ boolean useAdaptiveMutex(); - //TODO(AR) NOW -// /** -// * Sets the {@link EventListener}s whose callback functions -// * will be called when specific RocksDB event happens. -// * -// * @param listeners the listeners who should be notified on various events. -// * -// * @return the instance of the current object. -// */ -// T setListeners(final List listeners); -// -// /** -// * Gets the {@link EventListener}s whose callback functions -// * will be called when specific RocksDB event happens. -// * -// * @return a collection of Event listeners. -// */ -// Collection listeners(); + /** + * Sets the {@link EventListener}s whose callback functions + * will be called when specific RocksDB event happens. + * + * Note: the RocksJava API currently only supports EventListeners implemented in Java. + * It could be extended in future to also support adding/removing EventListeners implemented in + * C++. + * + * @param listeners the listeners who should be notified on various events. + * + * @return the instance of the current object. + */ + T setListeners(final List listeners); + + /** + * Sets the {@link EventListener}s whose callback functions + * will be called when specific RocksDB event happens. + * + * Note: the RocksJava API currently only supports EventListeners implemented in Java. + * It could be extended in future to also support adding/removing EventListeners implemented in + * C++. + * + * @return the instance of the current object. + */ + List listeners(); /** * If true, then the status of the threads involved in this DB will diff --git a/java/src/main/java/org/rocksdb/DirectSlice.java b/java/src/main/java/org/rocksdb/DirectSlice.java index b0d35c3cc5a..02fa3511fc0 100644 --- a/java/src/main/java/org/rocksdb/DirectSlice.java +++ b/java/src/main/java/org/rocksdb/DirectSlice.java @@ -110,6 +110,10 @@ public void removePrefix(final int n) { this.internalBufferOffset += n; } + public void setLength(final int n) { + setLength0(getNativeHandle(), n); + } + @Override protected void disposeInternal() { final long nativeHandle = getNativeHandle(); @@ -127,6 +131,7 @@ private native static long createNewDirectSlice0(final ByteBuffer data, private native void clear0(long handle, boolean internalBuffer, long internalBufferOffset); private native void removePrefix0(long handle, int length); + private native void setLength0(long handle, int length); private native void disposeInternalBuf(final long handle, long internalBufferOffset); } diff --git a/java/src/main/java/org/rocksdb/EventListener.java b/java/src/main/java/org/rocksdb/EventListener.java new file mode 100644 index 00000000000..808278d02b1 --- /dev/null +++ b/java/src/main/java/org/rocksdb/EventListener.java @@ -0,0 +1,332 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.List; + +/** + * EventListener class contains a set of callback functions that will + * be called when specific RocksDB event happens such as flush. It can + * be used as a building block for developing custom features such as + * stats-collector or external compaction algorithm. + * + * Note that callback functions should not run for an extended period of + * time before the function returns, otherwise RocksDB may be blocked. 
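+ * A listener is normally implemented by extending {@link AbstractEventListener}
+ * and registering it through
+ * {@link DBOptionsInterface#setListeners(java.util.List)} before the database
+ * is opened. A minimal sketch (the database path, the key/value bytes and the
+ * single enabled callback are illustrative only, and error handling is
+ * omitted):
+ * <pre>{@code
+ *   final AbstractEventListener listener = new AbstractEventListener(
+ *       AbstractEventListener.EnabledEventCallback.ON_FLUSH_COMPLETED) {
+ *     public void onFlushCompleted(final RocksDB db, final FlushJobInfo info) {
+ *       System.out.println("Flushed: " + info.getFilePath());
+ *     }
+ *   };
+ *   final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+ *       new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+ *   final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+ *   try (final DBOptions options = new DBOptions()
+ *            .setCreateIfMissing(true)
+ *            .setListeners(Collections.singletonList(listener));
+ *        final RocksDB db = RocksDB.open(options, "/tmp/listener-example",
+ *            cfDescriptors, cfHandles)) {
+ *     db.put(cfHandles.get(0), "key".getBytes(), "value".getBytes());
+ *     db.flush(new FlushOptions(), cfHandles.get(0));
+ *   }
+ * }</pre>
+ * Whatever a callback does, it must return promptly.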
+ * For example, it is not suggested to do + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * CompactionJobInfo)} (as it may run for a long while) or issue many of + * {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} + * (as Put may be blocked in certain cases) in the same thread in the + * EventListener callback. + * + * However, doing + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * CompactionJobInfo)} and {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} in + * another thread is considered safe. + * + * [Threading] All EventListener callback will be called using the + * actual thread that involves in that specific event. For example, it + * is the RocksDB background flush thread that does the actual flush to + * call {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. + * + * [Locking] All EventListener callbacks are designed to be called without + * the current thread holding any DB mutex. This is to prevent potential + * deadlock and performance issue when using EventListener callback + * in a complex way. + */ +public interface EventListener { + /** + * A callback function to RocksDB which will be called before a + * RocksDB starts to flush memtables. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db the database + * @param flushJobInfo the flush job info, contains data copied from + * respective native structure. + */ + void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo); + + /** + * callback function to RocksDB which will be called whenever a + * registered RocksDB flushes a file. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db the database + * @param flushJobInfo the flush job info, contains data copied from + * respective native structure. + */ + void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a SST file is deleted. Different from + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)} and + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)}, + * this callback is designed for external logging + * service and thus only provide string parameters instead + * of a pointer to DB. Applications that build logic basic based + * on file creations and deletions is suggested to implement + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)} and + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from the + * returned value. + * + * @param tableFileDeletionInfo the table file deletion info, + * contains data copied from respective native structure. + */ + void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo); + + /** + * A callback function to RocksDB which will be called before a + * RocksDB starts to compact. The default implementation is + * no-op. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. 
+ * + * @param db a pointer to the rocksdb instance which just compacted + * a file. + * @param compactionJobInfo a reference to a native CompactionJobInfo struct, + * which is released after this function is returned, and must be copied + * if it is needed outside of this function. + */ + void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a registered RocksDB compacts a file. The default implementation + * is a no-op. + * + * Note that this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db a pointer to the rocksdb instance which just compacted + * a file. + * @param compactionJobInfo a reference to a native CompactionJobInfo struct, + * which is released after this function is returned, and must be copied + * if it is needed outside of this function. + */ + void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a SST file is created. Different from OnCompactionCompleted and + * OnFlushCompleted, this callback is designed for external logging + * service and thus only provide string parameters instead + * of a pointer to DB. Applications that build logic basic based + * on file creations and deletions is suggested to implement + * OnFlushCompleted and OnCompactionCompleted. + * + * Historically it will only be called if the file is successfully created. + * Now it will also be called on failure case. User can check info.status + * to see if it succeeded or not. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param tableFileCreationInfo the table file creation info, + * contains data copied from respective native structure. + */ + void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo); + + /** + * A callback function for RocksDB which will be called before + * a SST file is being created. It will follow by OnTableFileCreated after + * the creation finishes. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param tableFileCreationBriefInfo the table file creation brief info, + * contains data copied from respective native structure. + */ + void onTableFileCreationStarted(final TableFileCreationBriefInfo tableFileCreationBriefInfo); + + /** + * A callback function for RocksDB which will be called before + * a memtable is made immutable. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param memTableInfo the mem table info, contains data + * copied from respective native structure. + */ + void onMemTableSealed(final MemTableInfo memTableInfo); + + /** + * A callback function for RocksDB which will be called before + * a column family handle is deleted. 
+ * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param columnFamilyHandle is a pointer to the column family handle to be + * deleted which will become a dangling pointer after the deletion. + */ + void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle); + + /** + * A callback function for RocksDB which will be called after an external + * file is ingested using IngestExternalFile. + * + * Note that the this function will run on the same thread as + * IngestExternalFile(), if this function is blocked, IngestExternalFile() + * will be blocked from finishing. + * + * @param db the database + * @param externalFileIngestionInfo the external file ingestion info, + * contains data copied from respective native structure. + */ + void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo); + + /** + * A callback function for RocksDB which will be called before setting the + * background error status to a non-OK value. The new background error status + * is provided in `bg_error` and can be modified by the callback. E.g., a + * callback can suppress errors by resetting it to Status::OK(), thus + * preventing the database from entering read-only mode. We do not provide any + * guarantee when failed flushes/compactions will be rescheduled if the user + * suppresses an error. + * + * Note that this function can run on the same threads as flush, compaction, + * and user writes. So, it is extremely important not to perform heavy + * computations or blocking calls in this function. + * + * @param backgroundErrorReason background error reason code + * @param backgroundError background error codes + */ + void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError); + + /** + * A callback function for RocksDB which will be called whenever a change + * of superversion triggers a change of the stall conditions. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param writeStallInfo write stall info, + * contains data copied from respective native structure. + */ + void onStallConditionsChanged(final WriteStallInfo writeStallInfo); + + /** + * A callback function for RocksDB which will be called whenever a file read + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileReadFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file write + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileWriteFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file flush + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileFlushFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file sync + * operation finishes. 
+ * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileSyncFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file + * rangeSync operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file + * truncate operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileTruncateFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file close + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileCloseFinish(final FileOperationInfo fileOperationInfo); + + /** + * If true, the {@link #onFileReadFinish(FileOperationInfo)} + * and {@link #onFileWriteFinish(FileOperationInfo)} will be called. If + * false, then they won't be called. + * + * Default: false + */ + boolean shouldBeNotifiedOnFileIO(); + + /** + * A callback function for RocksDB which will be called just before + * starting the automatic recovery process for recoverable background + * errors, such as NoSpace(). The callback can suppress the automatic + * recovery by setting returning false. The database will then + * have to be transitioned out of read-only mode by calling + * RocksDB#resume(). + * + * @param backgroundErrorReason background error reason code + * @param backgroundError background error codes + */ + boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError); + + /** + * A callback function for RocksDB which will be called once the database + * is recovered from read-only mode after an error. When this is called, it + * means normal writes to the database can be issued and the user can + * initiate any further recovery actions needed + * + * @param oldBackgroundError old background error codes + */ + void onErrorRecoveryCompleted(final Status oldBackgroundError); +} diff --git a/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java b/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java new file mode 100644 index 00000000000..6b14a80240b --- /dev/null +++ b/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class ExternalFileIngestionInfo { + private final String columnFamilyName; + private final String externalFilePath; + private final String internalFilePath; + private final long globalSeqno; + private final TableProperties tableProperties; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. 
+ */ + ExternalFileIngestionInfo(final String columnFamilyName, final String externalFilePath, + final String internalFilePath, final long globalSeqno, + final TableProperties tableProperties) { + this.columnFamilyName = columnFamilyName; + this.externalFilePath = externalFilePath; + this.internalFilePath = internalFilePath; + this.globalSeqno = globalSeqno; + this.tableProperties = tableProperties; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path of the file outside the DB. + * + * @return the path of the file outside the DB. + */ + public String getExternalFilePath() { + return externalFilePath; + } + + /** + * Get the path of the file inside the DB. + * + * @return the path of the file inside the DB. + */ + public String getInternalFilePath() { + return internalFilePath; + } + + /** + * Get the global sequence number assigned to keys in this file. + * + * @return the global sequence number. + */ + public long getGlobalSeqno() { + return globalSeqno; + } + + /** + * Get the Table properties of the table being flushed. + * + * @return the table properties. + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + ExternalFileIngestionInfo that = (ExternalFileIngestionInfo) o; + return globalSeqno == that.globalSeqno + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(externalFilePath, that.externalFilePath) + && Objects.equals(internalFilePath, that.internalFilePath) + && Objects.equals(tableProperties, that.tableProperties); + } + + @Override + public int hashCode() { + return Objects.hash( + columnFamilyName, externalFilePath, internalFilePath, globalSeqno, tableProperties); + } + + @Override + public String toString() { + return "ExternalFileIngestionInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", externalFilePath='" + externalFilePath + + '\'' + ", internalFilePath='" + internalFilePath + '\'' + ", globalSeqno=" + globalSeqno + + ", tableProperties=" + tableProperties + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/FileOperationInfo.java b/java/src/main/java/org/rocksdb/FileOperationInfo.java new file mode 100644 index 00000000000..aa5743ed377 --- /dev/null +++ b/java/src/main/java/org/rocksdb/FileOperationInfo.java @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +/** + * Java representation of FileOperationInfo struct from include/rocksdb/listener.h + */ +public class FileOperationInfo { + private final String path; + private final long offset; + private final long length; + private final long startTimestamp; + private final long duration; + private final Status status; + + /** + * Access is private as this will only be constructed from + * C++ via JNI. 
+ */ + FileOperationInfo(final String path, final long offset, final long length, + final long startTimestamp, final long duration, final Status status) { + this.path = path; + this.offset = offset; + this.length = length; + this.startTimestamp = startTimestamp; + this.duration = duration; + this.status = status; + } + + /** + * Get the file path. + * + * @return the file path. + */ + public String getPath() { + return path; + } + + /** + * Get the offset. + * + * @return the offset. + */ + public long getOffset() { + return offset; + } + + /** + * Get the length. + * + * @return the length. + */ + public long getLength() { + return length; + } + + /** + * Get the start timestamp (in nanoseconds). + * + * @return the start timestamp. + */ + public long getStartTimestamp() { + return startTimestamp; + } + + /** + * Get the operation duration (in nanoseconds). + * + * @return the operation duration. + */ + public long getDuration() { + return duration; + } + + /** + * Get the status. + * + * @return the status. + */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + FileOperationInfo that = (FileOperationInfo) o; + return offset == that.offset && length == that.length && startTimestamp == that.startTimestamp + && duration == that.duration && Objects.equals(path, that.path) + && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(path, offset, length, startTimestamp, duration, status); + } + + @Override + public String toString() { + return "FileOperationInfo{" + + "path='" + path + '\'' + ", offset=" + offset + ", length=" + length + ", startTimestamp=" + + startTimestamp + ", duration=" + duration + ", status=" + status + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/FlushJobInfo.java b/java/src/main/java/org/rocksdb/FlushJobInfo.java new file mode 100644 index 00000000000..ca9aa05236b --- /dev/null +++ b/java/src/main/java/org/rocksdb/FlushJobInfo.java @@ -0,0 +1,186 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class FlushJobInfo { + private final long columnFamilyId; + private final String columnFamilyName; + private final String filePath; + private final long threadId; + private final int jobId; + private final boolean triggeredWritesSlowdown; + private final boolean triggeredWritesStop; + private final long smallestSeqno; + private final long largestSeqno; + private final TableProperties tableProperties; + private final FlushReason flushReason; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. 
+ */ + FlushJobInfo(final long columnFamilyId, final String columnFamilyName, final String filePath, + final long threadId, final int jobId, final boolean triggeredWritesSlowdown, + final boolean triggeredWritesStop, final long smallestSeqno, final long largestSeqno, + final TableProperties tableProperties, final byte flushReasonValue) { + this.columnFamilyId = columnFamilyId; + this.columnFamilyName = columnFamilyName; + this.filePath = filePath; + this.threadId = threadId; + this.jobId = jobId; + this.triggeredWritesSlowdown = triggeredWritesSlowdown; + this.triggeredWritesStop = triggeredWritesStop; + this.smallestSeqno = smallestSeqno; + this.largestSeqno = largestSeqno; + this.tableProperties = tableProperties; + this.flushReason = FlushReason.fromValue(flushReasonValue); + } + + /** + * Get the id of the column family. + * + * @return the id of the column family + */ + public long getColumnFamilyId() { + return columnFamilyId; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path to the newly created file. + * + * @return the path to the newly created file + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the thread that completed this flush job. + * + * @return the id of the thread that completed this flush job + */ + public long getThreadId() { + return threadId; + } + + /** + * Get the job id, which is unique in the same thread. + * + * @return the job id + */ + public int getJobId() { + return jobId; + } + + /** + * Determine if rocksdb is currently slowing-down all writes to prevent + * creating too many Level 0 files as compaction seems not able to + * catch up the write request speed. + * + * This indicates that there are too many files in Level 0. + * + * @return true if rocksdb is currently slowing-down all writes, + * false otherwise + */ + public boolean isTriggeredWritesSlowdown() { + return triggeredWritesSlowdown; + } + + /** + * Determine if rocksdb is currently blocking any writes to prevent + * creating more L0 files. + * + * This indicates that there are too many files in level 0. + * Compactions should try to compact L0 files down to lower levels as soon + * as possible. + * + * @return true if rocksdb is currently blocking any writes, false otherwise + */ + public boolean isTriggeredWritesStop() { + return triggeredWritesStop; + } + + /** + * Get the smallest sequence number in the newly created file. + * + * @return the smallest sequence number + */ + public long getSmallestSeqno() { + return smallestSeqno; + } + + /** + * Get the largest sequence number in the newly created file. + * + * @return the largest sequence number + */ + public long getLargestSeqno() { + return largestSeqno; + } + + /** + * Get the Table properties of the table being flushed. + * + * @return the Table properties of the table being flushed + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + /** + * Get the reason for initiating the flush. + * + * @return the reason for initiating the flush. 
+ */ + public FlushReason getFlushReason() { + return flushReason; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + FlushJobInfo that = (FlushJobInfo) o; + return columnFamilyId == that.columnFamilyId && threadId == that.threadId && jobId == that.jobId + && triggeredWritesSlowdown == that.triggeredWritesSlowdown + && triggeredWritesStop == that.triggeredWritesStop && smallestSeqno == that.smallestSeqno + && largestSeqno == that.largestSeqno + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filePath, that.filePath) + && Objects.equals(tableProperties, that.tableProperties) && flushReason == that.flushReason; + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyId, columnFamilyName, filePath, threadId, jobId, + triggeredWritesSlowdown, triggeredWritesStop, smallestSeqno, largestSeqno, tableProperties, + flushReason); + } + + @Override + public String toString() { + return "FlushJobInfo{" + + "columnFamilyId=" + columnFamilyId + ", columnFamilyName='" + columnFamilyName + '\'' + + ", filePath='" + filePath + '\'' + ", threadId=" + threadId + ", jobId=" + jobId + + ", triggeredWritesSlowdown=" + triggeredWritesSlowdown + + ", triggeredWritesStop=" + triggeredWritesStop + ", smallestSeqno=" + smallestSeqno + + ", largestSeqno=" + largestSeqno + ", tableProperties=" + tableProperties + + ", flushReason=" + flushReason + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/FlushReason.java b/java/src/main/java/org/rocksdb/FlushReason.java new file mode 100644 index 00000000000..9d486cda16b --- /dev/null +++ b/java/src/main/java/org/rocksdb/FlushReason.java @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum FlushReason { + OTHERS((byte) 0x00), + GET_LIVE_FILES((byte) 0x01), + SHUTDOWN((byte) 0x02), + EXTERNAL_FILE_INGESTION((byte) 0x03), + MANUAL_COMPACTION((byte) 0x04), + WRITE_BUFFER_MANAGER((byte) 0x05), + WRITE_BUFFER_FULL((byte) 0x06), + TEST((byte) 0x07), + DELETE_FILES((byte) 0x08), + AUTO_COMPACTION((byte) 0x09), + MANUAL_FLUSH((byte) 0x0a), + ERROR_RECOVERY((byte) 0xb); + + private final byte value; + + FlushReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the FlushReason from the internal representation value. + * + * @return the flush reason. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static FlushReason fromValue(final byte value) { + for (final FlushReason flushReason : FlushReason.values()) { + if (flushReason.value == value) { + return flushReason; + } + } + + throw new IllegalArgumentException("Illegal value provided for FlushReason: " + value); + } +} diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 80d7c600edb..5953a7d9bdd 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -175,6 +175,11 @@ public enum HistogramType { */ NUM_SST_READ_PER_LEVEL((byte) 0x31), + /** + * The number of retry in auto resume + */ + ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x32), + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/java/src/main/java/org/rocksdb/MemTableInfo.java b/java/src/main/java/org/rocksdb/MemTableInfo.java new file mode 100644 index 00000000000..f4fb577c3a9 --- /dev/null +++ b/java/src/main/java/org/rocksdb/MemTableInfo.java @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class MemTableInfo { + private final String columnFamilyName; + private final long firstSeqno; + private final long earliestSeqno; + private final long numEntries; + private final long numDeletes; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + MemTableInfo(final String columnFamilyName, final long firstSeqno, final long earliestSeqno, + final long numEntries, final long numDeletes) { + this.columnFamilyName = columnFamilyName; + this.firstSeqno = firstSeqno; + this.earliestSeqno = earliestSeqno; + this.numEntries = numEntries; + this.numDeletes = numDeletes; + } + + /** + * Get the name of the column family to which memtable belongs. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the Sequence number of the first element that was inserted into the + * memtable. + * + * @return the sequence number of the first inserted element. + */ + public long getFirstSeqno() { + return firstSeqno; + } + + /** + * Get the Sequence number that is guaranteed to be smaller than or equal + * to the sequence number of any key that could be inserted into this + * memtable. It can then be assumed that any write with a larger(or equal) + * sequence number will be present in this memtable or a later memtable. + * + * @return the earliest sequence number. + */ + public long getEarliestSeqno() { + return earliestSeqno; + } + + /** + * Get the total number of entries in memtable. + * + * @return the total number of entries. + */ + public long getNumEntries() { + return numEntries; + } + + /** + * Get the total number of deletes in memtable. + * + * @return the total number of deletes. 
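A short sketch for MemTableInfo as well; onMemTableSealed(MemTableInfo) is an assumed callback name taken from the C++ EventListener API.

final AbstractEventListener memTableListener = new AbstractEventListener() {
  @Override
  public void onMemTableSealed(final MemTableInfo info) { // assumed callback name
    // Every entry in the sealed memtable has a sequence number >= getEarliestSeqno().
    System.out.println(info.getColumnFamilyName() + ": sealed memtable, "
        + info.getNumEntries() + " entries, " + info.getNumDeletes() + " deletes");
  }
};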
+ */ + public long getNumDeletes() { + return numDeletes; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + MemTableInfo that = (MemTableInfo) o; + return firstSeqno == that.firstSeqno && earliestSeqno == that.earliestSeqno + && numEntries == that.numEntries && numDeletes == that.numDeletes + && Objects.equals(columnFamilyName, that.columnFamilyName); + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyName, firstSeqno, earliestSeqno, numEntries, numDeletes); + } + + @Override + public String toString() { + return "MemTableInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", firstSeqno=" + firstSeqno + + ", earliestSeqno=" + earliestSeqno + ", numEntries=" + numEntries + + ", numDeletes=" + numDeletes + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index 95d1daa8097..57f3aeffbd3 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -970,6 +970,19 @@ public boolean strictBytesPerSync() { return strictBytesPerSync(nativeHandle_); } + @Override + public Options setListeners(final List listeners) { + assert (isOwningHandle()); + setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners)); + return this; + } + + @Override + public List listeners() { + assert (isOwningHandle()); + return Arrays.asList(eventListeners(nativeHandle_)); + } + @Override public Options setEnableThreadTracking(final boolean enableThreadTracking) { assert(isOwningHandle()); @@ -1395,7 +1408,7 @@ public List compressionPerLevel() { final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); - for (final Byte byteCompressionType : byteCompressionTypes) { + for (final byte byteCompressionType : byteCompressionTypes) { compressionLevels.add(CompressionType.getCompressionType( byteCompressionType)); } @@ -2151,6 +2164,9 @@ private native void setStrictBytesPerSync( final long handle, final boolean strictBytesPerSync); private native boolean strictBytesPerSync( final long handle); + private static native void setEventListeners( + final long handle, final long[] eventListenerHandles); + private static native AbstractEventListener[] eventListeners(final long handle); private native void setEnableThreadTracking(long handle, boolean enableThreadTracking); private native boolean enableThreadTracking(long handle); diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java index 1f1510568ff..8e287eb9d7a 100644 --- a/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/java/src/main/java/org/rocksdb/ReadOptions.java @@ -437,16 +437,15 @@ public ReadOptions setIgnoreRangeDeletions(final boolean ignoreRangeDeletions) { * * Default: null * - * @param iterateLowerBound Slice representing the upper bound + * @param iterateLowerBound Slice representing the lower bound * @return the reference to the current ReadOptions. */ - public ReadOptions setIterateLowerBound(final Slice iterateLowerBound) { + public ReadOptions setIterateLowerBound(final AbstractSlice iterateLowerBound) { assert(isOwningHandle()); - if (iterateLowerBound != null) { - // Hold onto a reference so it doesn't get garbage collected out from under us. 
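Illustrative use of the widened bound setters (a sketch assuming db is an already-open RocksDB instance): any AbstractSlice, e.g. a plain Slice, can now be passed, and null clears the bound rather than being silently ignored.

try (final Slice upper = new Slice("key4");
     final ReadOptions readOptions = new ReadOptions().setIterateUpperBound(upper);
     final RocksIterator it = db.newIterator(readOptions)) {
  for (it.seekToFirst(); it.isValid(); it.next()) {
    // Only keys strictly less than "key4" are visited.
    System.out.println(new String(it.key()) + " = " + new String(it.value()));
  }
  readOptions.setIterateUpperBound(null); // clears the bound for subsequently created iterators
}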
- iterateLowerBoundSlice_ = iterateLowerBound; - setIterateLowerBound(nativeHandle_, iterateLowerBoundSlice_.getNativeHandle()); - } + setIterateLowerBound( + nativeHandle_, iterateLowerBound == null ? 0 : iterateLowerBound.getNativeHandle()); + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateLowerBoundSlice_ = iterateLowerBound; return this; } @@ -485,13 +484,12 @@ public Slice iterateLowerBound() { * @param iterateUpperBound Slice representing the upper bound * @return the reference to the current ReadOptions. */ - public ReadOptions setIterateUpperBound(final Slice iterateUpperBound) { + public ReadOptions setIterateUpperBound(final AbstractSlice iterateUpperBound) { assert(isOwningHandle()); - if (iterateUpperBound != null) { - // Hold onto a reference so it doesn't get garbage collected out from under us. - iterateUpperBoundSlice_ = iterateUpperBound; - setIterateUpperBound(nativeHandle_, iterateUpperBoundSlice_.getNativeHandle()); - } + setIterateUpperBound( + nativeHandle_, iterateUpperBound == null ? 0 : iterateUpperBound.getNativeHandle()); + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateUpperBoundSlice_ = iterateUpperBound; return this; } @@ -570,8 +568,8 @@ public long iterStartSeqnum() { // freely leave scope without us losing the Java Slice object, which during // close() would also reap its associated rocksdb::Slice native object since // it's possibly (likely) to be an owning handle. - private Slice iterateLowerBoundSlice_; - private Slice iterateUpperBoundSlice_; + private AbstractSlice iterateLowerBoundSlice_; + private AbstractSlice iterateUpperBoundSlice_; private native static long newReadOptions(); private native static long newReadOptions(final boolean verifyChecksums, diff --git a/java/src/main/java/org/rocksdb/RocksCallbackObject.java b/java/src/main/java/org/rocksdb/RocksCallbackObject.java index a662f78fd70..8d7a867ee7c 100644 --- a/java/src/main/java/org/rocksdb/RocksCallbackObject.java +++ b/java/src/main/java/org/rocksdb/RocksCallbackObject.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.List; + /** * RocksCallbackObject is similar to {@link RocksObject} but varies * in its construction as it is designed for Java objects which have functions @@ -26,6 +28,27 @@ protected RocksCallbackObject(final long... nativeParameterHandles) { this.nativeHandle_ = initializeNative(nativeParameterHandles); } + /** + * Given a list of RocksCallbackObjects, it returns a list + * of the native handles of the underlying objects. + * + * @param objectList the rocks callback objects + * + * @return the native handles + */ + static /* @Nullable */ long[] toNativeHandleList( + /* @Nullable */ final List objectList) { + if (objectList == null) { + return null; + } + final int len = objectList.size(); + final long[] handleList = new long[len]; + for (int i = 0; i < len; i++) { + handleList[i] = objectList.get(i).nativeHandle_; + } + return handleList; + } + /** * Construct the Native C++ object which will callback * to our object methods diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 471a9bbd9e6..bec702faf37 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -38,6 +38,8 @@ private enum LibraryState { RocksDB.loadLibrary(); } + private List ownedColumnFamilyHandles = new ArrayList<>(); + /** * Loads the necessary library files. 
* Calling this method twice will have no effect. @@ -307,9 +309,12 @@ public static RocksDB open(final DBOptions options, final String path, db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -484,9 +489,12 @@ public static RocksDB openReadOnly(final DBOptions options, final String path, db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -577,9 +585,12 @@ public static RocksDB openAsSecondary(final DBOptions options, final String path db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -597,6 +608,11 @@ public static RocksDB openAsSecondary(final DBOptions options, final String path * @throws RocksDBException if an error occurs whilst closing. */ public void closeE() throws RocksDBException { + for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -619,6 +635,11 @@ public void closeE() throws RocksDBException { */ @Override public void close() { + for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -661,10 +682,12 @@ public static List listColumnFamilies(final Options options, public ColumnFamilyHandle createColumnFamily( final ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException { - return new ColumnFamilyHandle(this, createColumnFamily(nativeHandle_, - columnFamilyDescriptor.getName(), - columnFamilyDescriptor.getName().length, - columnFamilyDescriptor.getOptions().nativeHandle_)); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, + createColumnFamily(nativeHandle_, columnFamilyDescriptor.getName(), + columnFamilyDescriptor.getName().length, + columnFamilyDescriptor.getOptions().nativeHandle_)); + ownedColumnFamilyHandles.add(columnFamilyHandle); + return columnFamilyHandle; } /** @@ -688,8 +711,10 @@ public List createColumnFamilies( final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < cfHandles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } @@ -719,8 +744,10 @@ public List createColumnFamilies( final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < 
cfHandles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } @@ -753,7 +780,22 @@ public void dropColumnFamilies( dropColumnFamilies(nativeHandle_, cfHandles); } - //TODO(AR) what about DestroyColumnFamilyHandle + /** + * Deletes native column family handle of given {@link ColumnFamilyHandle} Java object + * and removes reference from {@link RocksDB#ownedColumnFamilyHandles}. + * + * @param columnFamilyHandle column family handle object. + */ + public void destroyColumnFamilyHandle(final ColumnFamilyHandle columnFamilyHandle) { + for (int i = 0; i < ownedColumnFamilyHandles.size(); ++i) { + final ColumnFamilyHandle ownedHandle = ownedColumnFamilyHandles.get(i); + if (ownedHandle.equals(columnFamilyHandle)) { + columnFamilyHandle.close(); + ownedColumnFamilyHandles.remove(i); + return; + } + } + } /** * Set the database entry for "key" to "value". @@ -2504,7 +2546,9 @@ public List multiGetAsList(final ReadOptions opt, /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2528,7 +2572,9 @@ public boolean keyMayExist(final byte[] key, /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2557,7 +2603,9 @@ public boolean keyMayExist(final byte[] key, /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2584,7 +2632,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2616,7 +2666,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. 
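A sketch of the intended calling pattern (assuming db is an already-open RocksDB instance; Holder is the org.rocksdb.Holder used by the existing keyMayExist overloads):

final Holder<byte[]> valueHolder = new Holder<>();
if (db.keyMayExist("key1".getBytes(), valueHolder)) {
  // "Maybe" can be a false positive, so confirm unless the value was already
  // found in memory and placed into the holder.
  final byte[] value = valueHolder.getValue() != null
      ? valueHolder.getValue()
      : db.get("key1".getBytes());
  System.out.println(value == null ? "false positive" : new String(value));
} else {
  // A false return is definitive: the key is guaranteed to be absent.
  System.out.println("definitely absent");
}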
* * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2643,7 +2695,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2675,7 +2729,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2704,7 +2760,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2792,8 +2850,8 @@ public RocksIterator newIterator(final ReadOptions readOptions) { } /** - *

<p>Return a heap-allocated iterator over the contents of the - * database. The result of newIterator() is initially invalid + *
<p>Return a heap-allocated iterator over the contents of a + * ColumnFamily. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator * before using it).</p>
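For example (a sketch assuming db is open and cfHandle is one of its ColumnFamilyHandles):

try (final RocksIterator it = db.newIterator(cfHandle)) {
  // The iterator is invalid until one of the Seek methods is called.
  for (it.seekToFirst(); it.isValid(); it.next()) {
    System.out.println(new String(it.key()) + " -> " + new String(it.value()));
  }
}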

    * @@ -2812,8 +2870,8 @@ public RocksIterator newIterator( } /** - *

<p>Return a heap-allocated iterator over the contents of the - * database. The result of newIterator() is initially invalid + *
<p>Return a heap-allocated iterator over the contents of a + * ColumnFamily. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator * before using it).</p>

    * @@ -3532,9 +3590,8 @@ public void setOptions( /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, final MutableColumnFamilyOptions mutableColumnFamilyOptions) throws RocksDBException { - setOptions(nativeHandle_, columnFamilyHandle.nativeHandle_, - mutableColumnFamilyOptions.getKeys(), - mutableColumnFamilyOptions.getValues()); + setOptions(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + mutableColumnFamilyOptions.getKeys(), mutableColumnFamilyOptions.getValues()); } /** @@ -4081,7 +4138,7 @@ public ColumnFamilyMetaData getColumnFamilyMetaData( * * @return the column family metadata */ - public ColumnFamilyMetaData GetColumnFamilyMetaData() { + public ColumnFamilyMetaData getColumnFamilyMetaData() { return getColumnFamilyMetaData(null); } @@ -4479,7 +4536,6 @@ private native void dropColumnFamily( final long handle, final long cfHandle) throws RocksDBException; private native void dropColumnFamilies(final long handle, final long[] cfHandles) throws RocksDBException; - //TODO(AR) best way to express DestroyColumnFamilyHandle? ...maybe in ColumnFamilyHandle? private native void put(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, int valueLength) throws RocksDBException; diff --git a/java/src/main/java/org/rocksdb/Status.java b/java/src/main/java/org/rocksdb/Status.java index e633940c297..033ed3ea1c0 100644 --- a/java/src/main/java/org/rocksdb/Status.java +++ b/java/src/main/java/org/rocksdb/Status.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Objects; + /** * Represents the status returned by a function call in RocksDB. * @@ -135,4 +137,19 @@ public byte getValue() { return value; } } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Status status = (Status) o; + return code == status.code && subCode == status.subCode && Objects.equals(state, status.state); + } + + @Override + public int hashCode() { + return Objects.hash(code, subCode, state); + } } diff --git a/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java b/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java new file mode 100644 index 00000000000..5a383ade41d --- /dev/null +++ b/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileCreationBriefInfo { + private final String dbName; + private final String columnFamilyName; + private final String filePath; + private final int jobId; + private final TableFileCreationReason reason; + + /** + * Access is private as this will only be constructed from + * C++ via JNI, either directly of via + * {@link TableFileCreationInfo#TableFileCreationInfo(long, TableProperties, Status, String, + * String, String, int, byte)}. 
+ * + * @param dbName the database name + * @param columnFamilyName the column family name + * @param filePath the path to the table file + * @param jobId the job identifier + * @param tableFileCreationReasonValue the reason for creation of the table file + */ + protected TableFileCreationBriefInfo(final String dbName, final String columnFamilyName, + final String filePath, final int jobId, final byte tableFileCreationReasonValue) { + this.dbName = dbName; + this.columnFamilyName = columnFamilyName; + this.filePath = filePath; + this.jobId = jobId; + this.reason = TableFileCreationReason.fromValue(tableFileCreationReasonValue); + } + + /** + * Get the name of the database where the file was created. + * + * @return the name of the database. + */ + public String getDbName() { + return dbName; + } + + /** + * Get the name of the column family where the file was created. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path to the created file. + * + * @return the path. + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the job (which could be flush or compaction) that + * created the file. + * + * @return the id of the job. + */ + public int getJobId() { + return jobId; + } + + /** + * Get the reason for creating the table. + * + * @return the reason for creating the table. + */ + public TableFileCreationReason getReason() { + return reason; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileCreationBriefInfo that = (TableFileCreationBriefInfo) o; + return jobId == that.jobId && Objects.equals(dbName, that.dbName) + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filePath, that.filePath) && reason == that.reason; + } + + @Override + public int hashCode() { + return Objects.hash(dbName, columnFamilyName, filePath, jobId, reason); + } + + @Override + public String toString() { + return "TableFileCreationBriefInfo{" + + "dbName='" + dbName + '\'' + ", columnFamilyName='" + columnFamilyName + '\'' + + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + ", reason=" + reason + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/TableFileCreationInfo.java b/java/src/main/java/org/rocksdb/TableFileCreationInfo.java new file mode 100644 index 00000000000..7742f32f19d --- /dev/null +++ b/java/src/main/java/org/rocksdb/TableFileCreationInfo.java @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileCreationInfo extends TableFileCreationBriefInfo { + private final long fileSize; + private final TableProperties tableProperties; + private final Status status; + + /** + * Access is protected as this will only be constructed from + * C++ via JNI. 
+ * + * @param fileSize the size of the table file + * @param tableProperties the properties of the table file + * @param status the status of the creation operation + * @param dbName the database name + * @param columnFamilyName the column family name + * @param filePath the path to the table file + * @param jobId the job identifier + * @param tableFileCreationReasonValue the reason for creation of the table file + */ + protected TableFileCreationInfo(final long fileSize, final TableProperties tableProperties, + final Status status, final String dbName, final String columnFamilyName, + final String filePath, final int jobId, final byte tableFileCreationReasonValue) { + super(dbName, columnFamilyName, filePath, jobId, tableFileCreationReasonValue); + this.fileSize = fileSize; + this.tableProperties = tableProperties; + this.status = status; + } + + /** + * Get the size of the file. + * + * @return the size. + */ + public long getFileSize() { + return fileSize; + } + + /** + * Get the detailed properties of the created file. + * + * @return the properties. + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + /** + * Get the status indicating whether the creation was successful or not. + * + * @return the status. + */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileCreationInfo that = (TableFileCreationInfo) o; + return fileSize == that.fileSize && Objects.equals(tableProperties, that.tableProperties) + && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(fileSize, tableProperties, status); + } + + @Override + public String toString() { + return "TableFileCreationInfo{" + + "fileSize=" + fileSize + ", tableProperties=" + tableProperties + ", status=" + status + + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/TableFileCreationReason.java b/java/src/main/java/org/rocksdb/TableFileCreationReason.java new file mode 100644 index 00000000000..d3984663dd2 --- /dev/null +++ b/java/src/main/java/org/rocksdb/TableFileCreationReason.java @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum TableFileCreationReason { + FLUSH((byte) 0x00), + COMPACTION((byte) 0x01), + RECOVERY((byte) 0x02), + MISC((byte) 0x03); + + private final byte value; + + TableFileCreationReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the TableFileCreationReason from the internal representation value. + * + * @return the table file creation reason. + * + * @throws IllegalArgumentException if the value is unknown. 
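A consumption sketch for the two creation-info classes; onTableFileCreated is an assumed callback name taken from the C++ EventListener API.

final AbstractEventListener creationListener = new AbstractEventListener() {
  @Override
  public void onTableFileCreated(final TableFileCreationInfo info) { // assumed callback name
    if (info.getStatus().getCode() == Status.Code.Ok) {
      System.out.println(info.getReason() + " created " + info.getFilePath() + " ("
          + info.getFileSize() + " bytes, job " + info.getJobId() + ")");
    }
  }
};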
+ */ + static TableFileCreationReason fromValue(final byte value) { + for (final TableFileCreationReason tableFileCreationReason : TableFileCreationReason.values()) { + if (tableFileCreationReason.value == value) { + return tableFileCreationReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for TableFileCreationReason: " + value); + } +} diff --git a/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java b/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java new file mode 100644 index 00000000000..8aad03ae8fa --- /dev/null +++ b/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileDeletionInfo { + private final String dbName; + private final String filePath; + private final int jobId; + private final Status status; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + TableFileDeletionInfo( + final String dbName, final String filePath, final int jobId, final Status status) { + this.dbName = dbName; + this.filePath = filePath; + this.jobId = jobId; + this.status = status; + } + + /** + * Get the name of the database where the file was deleted. + * + * @return the name of the database. + */ + public String getDbName() { + return dbName; + } + + /** + * Get the path to the deleted file. + * + * @return the path. + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the job which deleted the file. + * + * @return the id of the job. + */ + public int getJobId() { + return jobId; + } + + /** + * Get the status indicating whether the deletion was successful or not. + * + * @return the status + */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileDeletionInfo that = (TableFileDeletionInfo) o; + return jobId == that.jobId && Objects.equals(dbName, that.dbName) + && Objects.equals(filePath, that.filePath) && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(dbName, filePath, jobId, status); + } + + @Override + public String toString() { + return "TableFileDeletionInfo{" + + "dbName='" + dbName + '\'' + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + + ", status=" + status + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/TableProperties.java b/java/src/main/java/org/rocksdb/TableProperties.java index 8c0b7e370e2..c1baea2a4bc 100644 --- a/java/src/main/java/org/rocksdb/TableProperties.java +++ b/java/src/main/java/org/rocksdb/TableProperties.java @@ -1,7 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
package org.rocksdb; +import java.util.Arrays; import java.util.Map; +import java.util.Objects; /** * TableProperties contains read-only properties of its associated @@ -27,6 +29,8 @@ public class TableProperties { private final long columnFamilyId; private final long creationTime; private final long oldestKeyTime; + private final long slowCompressionEstimatedDataSize; + private final long fastCompressionEstimatedDataSize; private final byte[] columnFamilyName; private final String filterPolicyName; private final String comparatorName; @@ -39,24 +43,22 @@ public class TableProperties { private final Map propertiesOffsets; /** - * Access is private as this will only be constructed from - * C++ via JNI. + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. */ - private TableProperties(final long dataSize, final long indexSize, - final long indexPartitions, final long topLevelIndexSize, - final long indexKeyIsUserKey, final long indexValueIsDeltaEncoded, - final long filterSize, final long rawKeySize, final long rawValueSize, - final long numDataBlocks, final long numEntries, final long numDeletions, - final long numMergeOperands, final long numRangeDeletions, - final long formatVersion, final long fixedKeyLen, - final long columnFamilyId, final long creationTime, - final long oldestKeyTime, final byte[] columnFamilyName, - final String filterPolicyName, final String comparatorName, + TableProperties(final long dataSize, final long indexSize, final long indexPartitions, + final long topLevelIndexSize, final long indexKeyIsUserKey, + final long indexValueIsDeltaEncoded, final long filterSize, final long rawKeySize, + final long rawValueSize, final long numDataBlocks, final long numEntries, + final long numDeletions, final long numMergeOperands, final long numRangeDeletions, + final long formatVersion, final long fixedKeyLen, final long columnFamilyId, + final long creationTime, final long oldestKeyTime, + final long slowCompressionEstimatedDataSize, final long fastCompressionEstimatedDataSize, + final byte[] columnFamilyName, final String filterPolicyName, final String comparatorName, final String mergeOperatorName, final String prefixExtractorName, final String propertyCollectorsNames, final String compressionName, final Map userCollectedProperties, - final Map readableProperties, - final Map propertiesOffsets) { + final Map readableProperties, final Map propertiesOffsets) { this.dataSize = dataSize; this.indexSize = indexSize; this.indexPartitions = indexPartitions; @@ -76,6 +78,8 @@ private TableProperties(final long dataSize, final long indexSize, this.columnFamilyId = columnFamilyId; this.creationTime = creationTime; this.oldestKeyTime = oldestKeyTime; + this.slowCompressionEstimatedDataSize = slowCompressionEstimatedDataSize; + this.fastCompressionEstimatedDataSize = fastCompressionEstimatedDataSize; this.columnFamilyName = columnFamilyName; this.filterPolicyName = filterPolicyName; this.comparatorName = comparatorName; @@ -268,6 +272,26 @@ public long getOldestKeyTime() { return oldestKeyTime; } + /** + * Get the estimated size of data blocks compressed with a relatively slower + * compression algorithm. + * + * @return 0 means unknown, otherwise the timestamp. + */ + public long getSlowCompressionEstimatedDataSize() { + return slowCompressionEstimatedDataSize; + } + + /** + * Get the estimated size of data blocks compressed with a relatively faster + * compression algorithm. + * + * @return 0 means unknown, otherwise the timestamp. 
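The new estimates can be read alongside the existing TableProperties fields; a sketch assuming db is open and using the pre-existing getPropertiesOfAllTables() accessor (not part of this diff):

final Map<String, TableProperties> perFile = db.getPropertiesOfAllTables();
for (final Map.Entry<String, TableProperties> entry : perFile.entrySet()) {
  final TableProperties tp = entry.getValue();
  System.out.println(entry.getKey() + ": " + tp.getNumEntries() + " entries, "
      + tp.getDataSize() + " data bytes, slow-compression estimate "
      + tp.getSlowCompressionEstimatedDataSize() + ", fast-compression estimate "
      + tp.getFastCompressionEstimatedDataSize());
}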
+ */ + public long getFastCompressionEstimatedDataSize() { + return fastCompressionEstimatedDataSize; + } + /** * Get the name of the column family with which this * SST file is associated. @@ -363,4 +387,49 @@ public Map getReadableProperties() { public Map getPropertiesOffsets() { return propertiesOffsets; } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableProperties that = (TableProperties) o; + return dataSize == that.dataSize && indexSize == that.indexSize + && indexPartitions == that.indexPartitions && topLevelIndexSize == that.topLevelIndexSize + && indexKeyIsUserKey == that.indexKeyIsUserKey + && indexValueIsDeltaEncoded == that.indexValueIsDeltaEncoded + && filterSize == that.filterSize && rawKeySize == that.rawKeySize + && rawValueSize == that.rawValueSize && numDataBlocks == that.numDataBlocks + && numEntries == that.numEntries && numDeletions == that.numDeletions + && numMergeOperands == that.numMergeOperands && numRangeDeletions == that.numRangeDeletions + && formatVersion == that.formatVersion && fixedKeyLen == that.fixedKeyLen + && columnFamilyId == that.columnFamilyId && creationTime == that.creationTime + && oldestKeyTime == that.oldestKeyTime + && slowCompressionEstimatedDataSize == that.slowCompressionEstimatedDataSize + && fastCompressionEstimatedDataSize == that.fastCompressionEstimatedDataSize + && Arrays.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filterPolicyName, that.filterPolicyName) + && Objects.equals(comparatorName, that.comparatorName) + && Objects.equals(mergeOperatorName, that.mergeOperatorName) + && Objects.equals(prefixExtractorName, that.prefixExtractorName) + && Objects.equals(propertyCollectorsNames, that.propertyCollectorsNames) + && Objects.equals(compressionName, that.compressionName) + && Objects.equals(userCollectedProperties, that.userCollectedProperties) + && Objects.equals(readableProperties, that.readableProperties) + && Objects.equals(propertiesOffsets, that.propertiesOffsets); + } + + @Override + public int hashCode() { + int result = Objects.hash(dataSize, indexSize, indexPartitions, topLevelIndexSize, + indexKeyIsUserKey, indexValueIsDeltaEncoded, filterSize, rawKeySize, rawValueSize, + numDataBlocks, numEntries, numDeletions, numMergeOperands, numRangeDeletions, formatVersion, + fixedKeyLen, columnFamilyId, creationTime, oldestKeyTime, slowCompressionEstimatedDataSize, + fastCompressionEstimatedDataSize, filterPolicyName, comparatorName, mergeOperatorName, + prefixExtractorName, propertyCollectorsNames, compressionName, userCollectedProperties, + readableProperties, propertiesOffsets); + result = 31 * result + Arrays.hashCode(columnFamilyName); + return result; + } } diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 7a37f35b9e1..0d6cc5a92eb 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -742,6 +742,28 @@ public enum TickerType { COMPACT_WRITE_BYTES_PERIODIC((byte) -0x14), COMPACT_WRITE_BYTES_TTL((byte) -0x15), + /** + * DB error handler statistics + */ + ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x16), + ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x17), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x18), + ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x19), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x1A), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x1B), + + /** + * Bytes 
of raw data (payload) found on memtable at flush time. + * Contains the sum of garbage payload (bytes that are discarded + * at flush time) and useful payload (bytes of data that will + * eventually be written to SSTable). + */ + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH((byte) -0x1C), + /** + * Outdated bytes of data present on memtable at flush time. + */ + MEMTABLE_GARBAGE_BYTES_AT_FLUSH((byte) -0x1D), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java index f176701fa01..768329a675b 100644 --- a/java/src/main/java/org/rocksdb/Transaction.java +++ b/java/src/main/java/org/rocksdb/Transaction.java @@ -611,9 +611,9 @@ public RocksIterator getIterator(final ReadOptions readOptions) { } /** - * Returns an iterator that will iterate on all keys in the default - * column family including both keys in the DB and uncommitted keys in this - * transaction. + * Returns an iterator that will iterate on all keys in the column family + * specified by {@code columnFamilyHandle} including both keys in the DB + * and uncommitted keys in this transaction. * * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read * from the DB but will NOT change which keys are read from this transaction diff --git a/java/src/main/java/org/rocksdb/TtlDB.java b/java/src/main/java/org/rocksdb/TtlDB.java index c1e3bb473a9..a7adaf4b222 100644 --- a/java/src/main/java/org/rocksdb/TtlDB.java +++ b/java/src/main/java/org/rocksdb/TtlDB.java @@ -113,7 +113,7 @@ public static TtlDB open(final DBOptions options, final String db_path, throws RocksDBException { if (columnFamilyDescriptors.size() != ttlValues.size()) { throw new IllegalArgumentException("There must be a ttl value per column" - + "family handle."); + + " family handle."); } final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; diff --git a/java/src/main/java/org/rocksdb/WriteStallCondition.java b/java/src/main/java/org/rocksdb/WriteStallCondition.java new file mode 100644 index 00000000000..3bc9d410431 --- /dev/null +++ b/java/src/main/java/org/rocksdb/WriteStallCondition.java @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum WriteStallCondition { + NORMAL((byte) 0x0), + DELAYED((byte) 0x1), + STOPPED((byte) 0x2); + + private final byte value; + + WriteStallCondition(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the WriteStallCondition from the internal representation value. + * + * @return the flush reason. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static WriteStallCondition fromValue(final byte value) { + for (final WriteStallCondition writeStallCondition : WriteStallCondition.values()) { + if (writeStallCondition.value == value) { + return writeStallCondition; + } + } + + throw new IllegalArgumentException("Illegal value provided for WriteStallCondition: " + value); + } +} diff --git a/java/src/main/java/org/rocksdb/WriteStallInfo.java b/java/src/main/java/org/rocksdb/WriteStallInfo.java new file mode 100644 index 00000000000..4aef0eda9ad --- /dev/null +++ b/java/src/main/java/org/rocksdb/WriteStallInfo.java @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class WriteStallInfo { + private final String columnFamilyName; + private final WriteStallCondition currentCondition; + private final WriteStallCondition previousCondition; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + WriteStallInfo(final String columnFamilyName, final byte currentConditionValue, + final byte previousConditionValue) { + this.columnFamilyName = columnFamilyName; + this.currentCondition = WriteStallCondition.fromValue(currentConditionValue); + this.previousCondition = WriteStallCondition.fromValue(previousConditionValue); + } + + /** + * Get the name of the column family. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the current state of the write controller. + * + * @return the current state. + */ + public WriteStallCondition getCurrentCondition() { + return currentCondition; + } + + /** + * Get the previous state of the write controller. + * + * @return the previous state. 
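A sketch of how WriteStallInfo is typically observed; onStallConditionsChanged(WriteStallInfo) is an assumed callback name taken from the C++ EventListener API.

final AbstractEventListener stallListener = new AbstractEventListener() {
  @Override
  public void onStallConditionsChanged(final WriteStallInfo info) { // assumed callback name
    if (info.getCurrentCondition() == WriteStallCondition.STOPPED
        && info.getPreviousCondition() != WriteStallCondition.STOPPED) {
      System.err.println("writes stopped for " + info.getColumnFamilyName());
    }
  }
};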
+ */ + public WriteStallCondition getPreviousCondition() { + return previousCondition; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + WriteStallInfo that = (WriteStallInfo) o; + return Objects.equals(columnFamilyName, that.columnFamilyName) + && currentCondition == that.currentCondition && previousCondition == that.previousCondition; + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyName, currentCondition, previousCondition); + } + + @Override + public String toString() { + return "WriteStallInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", currentCondition=" + currentCondition + + ", previousCondition=" + previousCondition + '}'; + } +} diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java index a9a08763523..9fab479b272 100644 --- a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -5,16 +5,17 @@ package org.rocksdb; -import java.util.*; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import java.util.*; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.assertj.core.api.Assertions.assertThat; - public class ColumnFamilyTest { @ClassRule @@ -141,33 +142,19 @@ public void openWithColumnFamilies() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { - - try { - assertThat(columnFamilyHandleList.size()).isEqualTo(2); - db.put("dfkey1".getBytes(), "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), - "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), - "newcfvalue".getBytes()); - - String retVal = new String(db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes())); - assertThat(retVal).isEqualTo("newcfvalue"); - assertThat((db.get(columnFamilyHandleList.get(1), - "dfkey1".getBytes()))).isNull(); - db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); - assertThat((db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes()))).isNull(); - db.delete(columnFamilyHandleList.get(0), new WriteOptions(), - "dfkey2".getBytes()); - assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "dfkey2".getBytes())).isNull(); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + db.put("dfkey1".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), "newcfvalue".getBytes()); + + String retVal = new String(db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes())); + assertThat(retVal).isEqualTo("newcfvalue"); + assertThat((db.get(columnFamilyHandleList.get(1), "dfkey1".getBytes()))).isNull(); + db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); + assertThat((db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes()))).isNull(); + db.delete(columnFamilyHandleList.get(0), new 
WriteOptions(), "dfkey2".getBytes()); + assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), "dfkey2".getBytes())) + .isNull(); } } @@ -184,30 +171,22 @@ public void getWithOutValueAndCf() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), new WriteOptions(), - "key1".getBytes(), "value".getBytes()); - db.put("key2".getBytes(), "12345678".getBytes()); - final byte[] outValue = new byte[5]; - // not found value - int getResult = db.get("keyNotFound".getBytes(), outValue); - assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); - // found value which fits in outValue - getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), - outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("value".getBytes()); - // found value which fits partially - getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "key2".getBytes(), outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("12345".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put( + columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + final byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = + db.get(columnFamilyHandleList.get(0), new ReadOptions(), "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); } } @@ -223,22 +202,12 @@ public void createWriteDropColumnFamily() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - ColumnFamilyHandle tmpColumnFamilyHandle = null; - try { - tmpColumnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF".getBytes(), - new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.dropColumnFamily(tmpColumnFamilyHandle); - assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); - } finally { - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + ColumnFamilyHandle tmpColumnFamilyHandle; + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.dropColumnFamily(tmpColumnFamilyHandle); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); } } @@ -256,29 +225,15 @@ public void createWriteDropColumnFamilies() throws RocksDBException { columnFamilyHandleList)) { ColumnFamilyHandle tmpColumnFamilyHandle = null; ColumnFamilyHandle tmpColumnFamilyHandle2 = null; - try { - tmpColumnFamilyHandle = db.createColumnFamily( - new 
ColumnFamilyDescriptor("tmpCF".getBytes(), - new ColumnFamilyOptions())); - tmpColumnFamilyHandle2 = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF2".getBytes(), - new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); - db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); - assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); - assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); - } finally { - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.close(); - } - if (tmpColumnFamilyHandle2 != null) { - tmpColumnFamilyHandle2.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + tmpColumnFamilyHandle2 = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF2".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); + db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); + assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); } } @@ -300,36 +255,24 @@ public void writeBatch() throws RocksDBException { cfDescriptors, columnFamilyHandleList); final WriteBatch writeBatch = new WriteBatch(); final WriteOptions writeOpt = new WriteOptions()) { - try { - writeBatch.put("key".getBytes(), "value".getBytes()); - writeBatch.put(db.getDefaultColumnFamily(), - "mergeKey".getBytes(), "merge".getBytes()); - writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), - "merge".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - writeBatch.delete("xyz".getBytes()); - writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); - db.write(writeOpt, writeBatch); - - assertThat(db.get(columnFamilyHandleList.get(1), - "xyz".getBytes()) == null); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey".getBytes()))).isEqualTo("value"); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey2".getBytes()))).isEqualTo("value2"); - assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); - // check if key is merged - assertThat(new String(db.get(db.getDefaultColumnFamily(), - "mergeKey".getBytes()))).isEqualTo("merge,merge"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + writeBatch.put("key".getBytes(), "value".getBytes()); + writeBatch.put(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); + writeBatch.delete("xyz".getBytes()); + writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); + db.write(writeOpt, writeBatch); + + assertThat(db.get(columnFamilyHandleList.get(1), "xyz".getBytes()) == null); + 
assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey".getBytes()))) + .isEqualTo("value"); + assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey2".getBytes()))) + .isEqualTo("value2"); + assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); + // check if key is merged + assertThat(new String(db.get(db.getDefaultColumnFamily(), "mergeKey".getBytes()))) + .isEqualTo("merge,merge"); } } } @@ -346,32 +289,21 @@ public void iteratorOnColumnFamily() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - try (final RocksIterator rocksIterator = - db.newIterator(columnFamilyHandleList.get(1))) { - rocksIterator.seekToFirst(); - Map refMap = new HashMap<>(); - refMap.put("newcfkey", "value"); - refMap.put("newcfkey2", "value2"); - int i = 0; - while (rocksIterator.isValid()) { - i++; - assertThat(refMap.get(new String(rocksIterator.key()))). - isEqualTo(new String(rocksIterator.value())); - rocksIterator.next(); - } - assertThat(i).isEqualTo(2); - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); + try (final RocksIterator rocksIterator = db.newIterator(columnFamilyHandleList.get(1))) { + rocksIterator.seekToFirst(); + Map refMap = new HashMap<>(); + refMap.put("newcfkey", "value"); + refMap.put("newcfkey2", "value2"); + int i = 0; + while (rocksIterator.isValid()) { + i++; + assertThat(refMap.get(new String(rocksIterator.key()))) + .isEqualTo(new String(rocksIterator.value())); + rocksIterator.next(); } + assertThat(i).isEqualTo(2); } } } @@ -388,35 +320,20 @@ public void multiGet() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - - final List keys = Arrays.asList(new byte[][]{ - "key".getBytes(), "newcfkey".getBytes() - }); - - List retValues = db.multiGetAsList(columnFamilyHandleList, keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + final List keys = + Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + + List retValues = db.multiGetAsList(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new 
String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); } } @@ -432,35 +349,19 @@ public void multiGetAsList() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - - final List keys = Arrays.asList(new byte[][]{ - "key".getBytes(), "newcfkey".getBytes() - }); - List retValues = db.multiGetAsList(columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + final List keys = + Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + List retValues = db.multiGetAsList(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); } } @@ -476,30 +377,18 @@ public void properties() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - assertThat(db.getProperty("rocksdb.estimate-num-keys")). - isNotNull(); - assertThat(db.getLongProperty(columnFamilyHandleList.get(0), - "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0); - assertThat(db.getProperty("rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(0), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.estimate-num-keys")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). - isNotNull(); - assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). 
- isGreaterThanOrEqualTo(0); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + assertThat(db.getProperty("rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getLongProperty(columnFamilyHandleList.get(0), "rocksdb.estimate-num-keys")) + .isGreaterThanOrEqualTo(0); + assertThat(db.getProperty("rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(0), "rocksdb.sstables")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.estimate-num-keys")) + .isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.sstables")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")) + .isGreaterThanOrEqualTo(0); } } @@ -547,10 +436,6 @@ public void iterators() throws RocksDBException { rocksIterator.close(); } } - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } } } } @@ -566,15 +451,8 @@ public void failPutDisposedCF() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.put(columnFamilyHandleList.get(1), "key".getBytes(), - "value".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes()); } } @@ -589,15 +467,8 @@ public void failRemoveDisposedCF() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.delete(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.delete(columnFamilyHandleList.get(1), "key".getBytes()); } } @@ -612,15 +483,8 @@ public void failGetDisposedCF() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.get(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.get(columnFamilyHandleList.get(1), "key".getBytes()); } } @@ -635,19 +499,11 @@ public void failMultiGetWithoutCorrectNumberOfCF() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - final List keys = new ArrayList<>(); - keys.add("key".getBytes()); - keys.add("newcfkey".getBytes()); - final List cfCustomList = new ArrayList<>(); - db.multiGetAsList(cfCustomList, keys); - - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + final List keys = new 
ArrayList<>(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + final List cfCustomList = new ArrayList<>(); + db.multiGetAsList(cfCustomList, keys); } } @@ -661,25 +517,12 @@ public void testByteCreateFolumnFamily() throws RocksDBException { final byte[] b0 = new byte[]{(byte) 0x00}; final byte[] b1 = new byte[]{(byte) 0x01}; final byte[] b2 = new byte[]{(byte) 0x02}; - ColumnFamilyHandle cf1 = null, cf2 = null, cf3 = null; - try { - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - cf3 = db.createColumnFamily(new ColumnFamilyDescriptor(b2)); - } finally { - if (cf1 != null) { - cf1.close(); - } - if (cf2 != null) { - cf2.close(); - } - if (cf3 != null) { - cf3.close(); - } - } + db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); + db.createColumnFamily(new ColumnFamilyDescriptor(b2)); } } @@ -690,22 +533,13 @@ public void testCFNamesWithZeroBytes() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); ) { - try { - final byte[] b0 = new byte[]{0, 0}; - final byte[] b1 = new byte[]{0, 1}; - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - } finally { - if (cf1 != null) { - cf1.close(); - } - if (cf2 != null) { - cf2.close(); - } - } + final byte[] b0 = new byte[] {0, 0}; + final byte[] b1 = new byte[] {0, 1}; + cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); } } @@ -716,17 +550,57 @@ public void testCFNameSimplifiedChinese() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); ) { + final String simplifiedChinese = "\u7b80\u4f53\u5b57"; + columnFamilyHandle = + db.createColumnFamily(new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); + + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), simplifiedChinese.getBytes()); + } + } + + @Test + public void testDestroyColumnFamilyHandle() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());) { + final byte[] name1 = "cf1".getBytes(); + final byte[] name2 = "cf2".getBytes(); + final ColumnFamilyDescriptor desc1 = new ColumnFamilyDescriptor(name1); + final ColumnFamilyDescriptor desc2 = new ColumnFamilyDescriptor(name2); + final ColumnFamilyHandle cf1 = db.createColumnFamily(desc1); + final ColumnFamilyHandle cf2 = db.createColumnFamily(desc2); + assertTrue(cf1.isOwningHandle()); + assertTrue(cf2.isOwningHandle()); + assertFalse(cf1.isDefaultColumnFamily()); + 
db.destroyColumnFamilyHandle(cf1); + // At this point cf1 should not be used! + assertFalse(cf1.isOwningHandle()); + assertTrue(cf2.isOwningHandle()); + } + } + + @Test + @Deprecated + /** + * @deprecated Now explicitly closing instances of ColumnFamilyHandle is not required. + * RocksDB instance will take care of closing its associated ColumnFamilyHandle objects. + */ + public void testColumnFamilyCloseBeforeDb() throws RocksDBException { + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { try { - final String simplifiedChinese = "\u7b80\u4f53\u5b57"; - columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); - - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), - simplifiedChinese.getBytes()); + db.put("testKey".getBytes(), "tstValue".getBytes()); + // Do something... } finally { - if (columnFamilyHandle != null) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { columnFamilyHandle.close(); } } diff --git a/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java b/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java index e05f1eef3a7..35a14eb5490 100644 --- a/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java +++ b/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java @@ -39,29 +39,22 @@ public void columnFamilyOptions_setCompactionFilterFactory() final List cfHandles = new ArrayList<>(); - try (final RocksDB rocksDb = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles); - ) { - try { - final byte[] key1 = "key1".getBytes(); - final byte[] key2 = "key2".getBytes(); + try (final RocksDB rocksDb = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles)) { + final byte[] key1 = "key1".getBytes(); + final byte[] key2 = "key2".getBytes(); - final byte[] value1 = "value1".getBytes(); - final byte[] value2 = new byte[0]; + final byte[] value1 = "value1".getBytes(); + final byte[] value2 = new byte[0]; - rocksDb.put(cfHandles.get(1), key1, value1); - rocksDb.put(cfHandles.get(1), key2, value2); + rocksDb.put(cfHandles.get(1), key1, value1); + rocksDb.put(cfHandles.get(1), key2, value2); - rocksDb.compactRange(cfHandles.get(1)); + rocksDb.compactRange(cfHandles.get(1)); - assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); - final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), key2, null); - assertThat(exists).isFalse(); - } finally { - for (final ColumnFamilyHandle cfHandle : cfHandles) { - cfHandle.close(); - } - } + assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); + final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), key2, null); + assertThat(exists).isFalse(); } } } diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index 32e5aa21ce5..17964aaef1e 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -5,13 +5,16 @@ package org.rocksdb; -import 
org.junit.ClassRule; -import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; import java.nio.file.Paths; import java.util.*; - -import static org.assertj.core.api.Assertions.assertThat; +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.ClassRule; +import org.junit.Test; public class DBOptionsTest { @@ -895,4 +898,38 @@ public void skipCheckingSstFileSizesOnDbOpen() { assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true); } } + + @Test + public void eventListeners() { + final AtomicBoolean wasCalled1 = new AtomicBoolean(); + final AtomicBoolean wasCalled2 = new AtomicBoolean(); + try (final DBOptions options = new DBOptions(); + final AbstractEventListener el1 = + new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + wasCalled1.set(true); + } + }; + final AbstractEventListener el2 = + new AbstractEventListener() { + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + wasCalled2.set(true); + } + }) { + assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); + List listeners = options.listeners(); + assertEquals(el1, listeners.get(0)); + assertEquals(el2, listeners.get(1)); + options.setListeners(Collections.emptyList()); + listeners.get(0).onTableFileDeleted(null); + assertTrue(wasCalled1.get()); + listeners.get(1).onMemTableSealed(null); + assertTrue(wasCalled2.get()); + List listeners2 = options.listeners(); + assertNotNull(listeners2); + assertEquals(0, listeners2.size()); + } + } } diff --git a/java/src/test/java/org/rocksdb/EventListenerTest.java b/java/src/test/java/org/rocksdb/EventListenerTest.java new file mode 100644 index 00000000000..61193ff6765 --- /dev/null +++ b/java/src/test/java/org/rocksdb/EventListenerTest.java @@ -0,0 +1,765 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.*; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.AbstractEventListener.EnabledEventCallback; +import org.rocksdb.test.TestableEventListener; + +public class EventListenerTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory(); + + void flushDb(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + db.flush(new FlushOptions()); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onFlushCompleted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onFlushCompletedListener = new AbstractEventListener() { + @Override + public void onFlushCompleted(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) { + assertNotNull(flushJobInfo.getColumnFamilyName()); + assertEquals(FlushReason.MANUAL_FLUSH, flushJobInfo.getFlushReason()); + wasCbCalled.set(true); + } + }; + flushDb(onFlushCompletedListener, wasCbCalled); + } + + @Test + public void onFlushBegin() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onFlushBeginListener = new AbstractEventListener() { + @Override + public void onFlushBegin(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) { + assertNotNull(flushJobInfo.getColumnFamilyName()); + assertEquals(FlushReason.MANUAL_FLUSH, flushJobInfo.getFlushReason()); + wasCbCalled.set(true); + } + }; + flushDb(onFlushBeginListener, wasCbCalled); + } + + void deleteTableFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + final RocksDB.LiveFiles liveFiles = db.getLiveFiles(); + assertNotNull(liveFiles); + assertNotNull(liveFiles.files); + assertFalse(liveFiles.files.isEmpty()); + db.deleteFile(liveFiles.files.get(0)); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onTableFileDeleted() throws RocksDBException, InterruptedException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileDeletedListener = new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + assertNotNull(tableFileDeletionInfo.getDbName()); + wasCbCalled.set(true); + } + }; + deleteTableFile(onTableFileDeletedListener, wasCbCalled); + } + + void compactRange(final 
AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + db.compactRange(); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onCompactionBegin() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onCompactionBeginListener = new AbstractEventListener() { + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + assertEquals(CompactionReason.kManualCompaction, compactionJobInfo.compactionReason()); + wasCbCalled.set(true); + } + }; + compactRange(onCompactionBeginListener, wasCbCalled); + } + + @Test + public void onCompactionCompleted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onCompactionCompletedListener = new AbstractEventListener() { + @Override + public void onCompactionCompleted( + final RocksDB db, final CompactionJobInfo compactionJobInfo) { + assertEquals(CompactionReason.kManualCompaction, compactionJobInfo.compactionReason()); + wasCbCalled.set(true); + } + }; + compactRange(onCompactionCompletedListener, wasCbCalled); + } + + @Test + public void onTableFileCreated() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileCreatedListener = new AbstractEventListener() { + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + assertEquals(TableFileCreationReason.FLUSH, tableFileCreationInfo.getReason()); + wasCbCalled.set(true); + } + }; + flushDb(onTableFileCreatedListener, wasCbCalled); + } + + @Test + public void onTableFileCreationStarted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileCreationStartedListener = new AbstractEventListener() { + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + assertEquals(TableFileCreationReason.FLUSH, tableFileCreationBriefInfo.getReason()); + wasCbCalled.set(true); + } + }; + flushDb(onTableFileCreationStartedListener, wasCbCalled); + } + + void deleteColumnFamilyHandle(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + ColumnFamilyHandle columnFamilyHandle = db.getDefaultColumnFamily(); + columnFamilyHandle.close(); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onColumnFamilyHandleDeletionStarted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onColumnFamilyHandleDeletionStartedListener = + new AbstractEventListener() { + @Override + public void onColumnFamilyHandleDeletionStarted( + final ColumnFamilyHandle columnFamilyHandle) { + assertNotNull(columnFamilyHandle); + wasCbCalled.set(true); + } + 
}; + deleteColumnFamilyHandle(onColumnFamilyHandleDeletionStartedListener, wasCbCalled); + } + + void ingestExternalFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final String uuid = UUID.randomUUID().toString(); + final SstFileWriter sstFileWriter = new SstFileWriter(new EnvOptions(), opt); + final Path externalFilePath = Paths.get(db.getName(), uuid); + sstFileWriter.open(externalFilePath.toString()); + sstFileWriter.put("testKey".getBytes(), uuid.getBytes()); + sstFileWriter.finish(); + db.ingestExternalFile( + Collections.singletonList(externalFilePath.toString()), new IngestExternalFileOptions()); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onExternalFileIngested() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onExternalFileIngestedListener = new AbstractEventListener() { + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + assertNotNull(db); + wasCbCalled.set(true); + } + }; + ingestExternalFile(onExternalFileIngestedListener, wasCbCalled); + } + + @Test + public void testAllCallbacksInvocation() { + final int TEST_INT_VAL = -1; + final long TEST_LONG_VAL = -1; + // Expected test data objects + final Map userCollectedPropertiesTestData = + Collections.singletonMap("key", "value"); + final Map readablePropertiesTestData = Collections.singletonMap("key", "value"); + final Map propertiesOffsetsTestData = + Collections.singletonMap("key", TEST_LONG_VAL); + final TableProperties tablePropertiesTestData = new TableProperties(TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, "columnFamilyName".getBytes(), "filterPolicyName", + "comparatorName", "mergeOperatorName", "prefixExtractorName", "propertyCollectorsNames", + "compressionName", userCollectedPropertiesTestData, readablePropertiesTestData, + propertiesOffsetsTestData); + final FlushJobInfo flushJobInfoTestData = new FlushJobInfo(Integer.MAX_VALUE, + "testColumnFamily", "/file/path", TEST_LONG_VAL, Integer.MAX_VALUE, true, true, + TEST_LONG_VAL, TEST_LONG_VAL, tablePropertiesTestData, (byte) 0x0a); + final Status statusTestData = new Status(Status.Code.Incomplete, Status.SubCode.NoSpace, null); + final TableFileDeletionInfo tableFileDeletionInfoTestData = + new TableFileDeletionInfo("dbName", "/file/path", Integer.MAX_VALUE, statusTestData); + final TableFileCreationInfo tableFileCreationInfoTestData = + new TableFileCreationInfo(TEST_LONG_VAL, tablePropertiesTestData, statusTestData, "dbName", + "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03); + final TableFileCreationBriefInfo tableFileCreationBriefInfoTestData = + new TableFileCreationBriefInfo( + "dbName", "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03); + final MemTableInfo memTableInfoTestData = new MemTableInfo( + "columnFamilyName", TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL); + final FileOperationInfo 
fileOperationInfoTestData = new FileOperationInfo("/file/path", + TEST_LONG_VAL, TEST_LONG_VAL, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData); + final WriteStallInfo writeStallInfoTestData = + new WriteStallInfo("columnFamilyName", (byte) 0x1, (byte) 0x2); + final ExternalFileIngestionInfo externalFileIngestionInfoTestData = + new ExternalFileIngestionInfo("columnFamilyName", "/external/file/path", + "/internal/file/path", TEST_LONG_VAL, tablePropertiesTestData); + + final CapturingTestableEventListener listener = new CapturingTestableEventListener() { + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + super.onFlushCompleted(db, flushJobInfo); + assertEquals(flushJobInfoTestData, flushJobInfo); + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + super.onFlushBegin(db, flushJobInfo); + assertEquals(flushJobInfoTestData, flushJobInfo); + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + super.onTableFileDeleted(tableFileDeletionInfo); + assertEquals(tableFileDeletionInfoTestData, tableFileDeletionInfo); + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + super.onCompactionBegin(db, compactionJobInfo); + assertArrayEquals( + "compactionColumnFamily".getBytes(), compactionJobInfo.columnFamilyName()); + assertEquals(statusTestData, compactionJobInfo.status()); + assertEquals(TEST_LONG_VAL, compactionJobInfo.threadId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.jobId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.baseInputLevel()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.outputLevel()); + assertEquals(Collections.singletonList("inputFile.sst"), compactionJobInfo.inputFiles()); + assertEquals(Collections.singletonList("outputFile.sst"), compactionJobInfo.outputFiles()); + assertEquals(Collections.singletonMap("tableProperties", tablePropertiesTestData), + compactionJobInfo.tableProperties()); + assertEquals(CompactionReason.kFlush, compactionJobInfo.compactionReason()); + assertEquals(CompressionType.SNAPPY_COMPRESSION, compactionJobInfo.compression()); + } + + @Override + public void onCompactionCompleted( + final RocksDB db, final CompactionJobInfo compactionJobInfo) { + super.onCompactionCompleted(db, compactionJobInfo); + assertArrayEquals( + "compactionColumnFamily".getBytes(), compactionJobInfo.columnFamilyName()); + assertEquals(statusTestData, compactionJobInfo.status()); + assertEquals(TEST_LONG_VAL, compactionJobInfo.threadId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.jobId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.baseInputLevel()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.outputLevel()); + assertEquals(Collections.singletonList("inputFile.sst"), compactionJobInfo.inputFiles()); + assertEquals(Collections.singletonList("outputFile.sst"), compactionJobInfo.outputFiles()); + assertEquals(Collections.singletonMap("tableProperties", tablePropertiesTestData), + compactionJobInfo.tableProperties()); + assertEquals(CompactionReason.kFlush, compactionJobInfo.compactionReason()); + assertEquals(CompressionType.SNAPPY_COMPRESSION, compactionJobInfo.compression()); + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + super.onTableFileCreated(tableFileCreationInfo); + assertEquals(tableFileCreationInfoTestData, tableFileCreationInfo); + } + + @Override + 
public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + super.onTableFileCreationStarted(tableFileCreationBriefInfo); + assertEquals(tableFileCreationBriefInfoTestData, tableFileCreationBriefInfo); + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + super.onMemTableSealed(memTableInfo); + assertEquals(memTableInfoTestData, memTableInfo); + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + super.onColumnFamilyHandleDeletionStarted(columnFamilyHandle); + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + super.onExternalFileIngested(db, externalFileIngestionInfo); + assertEquals(externalFileIngestionInfoTestData, externalFileIngestionInfo); + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + super.onBackgroundError(backgroundErrorReason, backgroundError); + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + super.onStallConditionsChanged(writeStallInfo); + assertEquals(writeStallInfoTestData, writeStallInfo); + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + super.onFileReadFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + super.onFileWriteFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + super.onFileFlushFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + super.onFileSyncFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + super.onFileRangeSyncFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + assertEquals(fileOperationInfoTestData, fileOperationInfo); + super.onFileTruncateFinish(fileOperationInfo); + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + super.onFileCloseFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + super.shouldBeNotifiedOnFileIO(); + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + super.onErrorRecoveryBegin(backgroundErrorReason, backgroundError); + assertEquals(BackgroundErrorReason.FLUSH, backgroundErrorReason); + assertEquals(statusTestData, backgroundError); + return true; + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + super.onErrorRecoveryCompleted(oldBackgroundError); + assertEquals(statusTestData, oldBackgroundError); + } + }; + + // test action + listener.invokeAllCallbacks(); + + // assert + assertAllEventsCalled(listener); + } + + @Test + public void testEnabledCallbacks() { + final 
EnabledEventCallback enabledEvents[] = { + EnabledEventCallback.ON_MEMTABLE_SEALED, EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED}; + + final CapturingTestableEventListener listener = + new CapturingTestableEventListener(enabledEvents); + + // test action + listener.invokeAllCallbacks(); + + // assert + assertEventsCalled(listener, enabledEvents); + } + + private static void assertAllEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener) { + assertEventsCalled(capturingTestableEventListener, EnumSet.allOf(EnabledEventCallback.class)); + } + + private static void assertEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener, + final EnabledEventCallback[] expected) { + assertEventsCalled(capturingTestableEventListener, EnumSet.copyOf(Arrays.asList(expected))); + } + + private static void assertEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener, + final EnumSet expected) { + final ListenerEvents capturedEvents = capturingTestableEventListener.capturedListenerEvents; + + if (expected.contains(EnabledEventCallback.ON_FLUSH_COMPLETED)) { + assertTrue("onFlushCompleted was not called", capturedEvents.flushCompleted); + } else { + assertFalse("onFlushCompleted was not called", capturedEvents.flushCompleted); + } + + if (expected.contains(EnabledEventCallback.ON_FLUSH_BEGIN)) { + assertTrue("onFlushBegin was not called", capturedEvents.flushBegin); + } else { + assertFalse("onFlushBegin was called", capturedEvents.flushBegin); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_DELETED)) { + assertTrue("onTableFileDeleted was not called", capturedEvents.tableFileDeleted); + } else { + assertFalse("onTableFileDeleted was called", capturedEvents.tableFileDeleted); + } + + if (expected.contains(EnabledEventCallback.ON_COMPACTION_BEGIN)) { + assertTrue("onCompactionBegin was not called", capturedEvents.compactionBegin); + } else { + assertFalse("onCompactionBegin was called", capturedEvents.compactionBegin); + } + + if (expected.contains(EnabledEventCallback.ON_COMPACTION_COMPLETED)) { + assertTrue("onCompactionCompleted was not called", capturedEvents.compactionCompleted); + } else { + assertFalse("onCompactionCompleted was called", capturedEvents.compactionCompleted); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATED)) { + assertTrue("onTableFileCreated was not called", capturedEvents.tableFileCreated); + } else { + assertFalse("onTableFileCreated was called", capturedEvents.tableFileCreated); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATION_STARTED)) { + assertTrue( + "onTableFileCreationStarted was not called", capturedEvents.tableFileCreationStarted); + } else { + assertFalse("onTableFileCreationStarted was called", capturedEvents.tableFileCreationStarted); + } + + if (expected.contains(EnabledEventCallback.ON_MEMTABLE_SEALED)) { + assertTrue("onMemTableSealed was not called", capturedEvents.memTableSealed); + } else { + assertFalse("onMemTableSealed was called", capturedEvents.memTableSealed); + } + + if (expected.contains(EnabledEventCallback.ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED)) { + assertTrue("onColumnFamilyHandleDeletionStarted was not called", + capturedEvents.columnFamilyHandleDeletionStarted); + } else { + assertFalse("onColumnFamilyHandleDeletionStarted was called", + capturedEvents.columnFamilyHandleDeletionStarted); + } + + if (expected.contains(EnabledEventCallback.ON_EXTERNAL_FILE_INGESTED)) { + 
assertTrue("onExternalFileIngested was not called", capturedEvents.externalFileIngested); + } else { + assertFalse("onExternalFileIngested was called", capturedEvents.externalFileIngested); + } + + if (expected.contains(EnabledEventCallback.ON_BACKGROUND_ERROR)) { + assertTrue("onBackgroundError was not called", capturedEvents.backgroundError); + } else { + assertFalse("onBackgroundError was called", capturedEvents.backgroundError); + } + + if (expected.contains(EnabledEventCallback.ON_STALL_CONDITIONS_CHANGED)) { + assertTrue("onStallConditionsChanged was not called", capturedEvents.stallConditionsChanged); + } else { + assertFalse("onStallConditionsChanged was called", capturedEvents.stallConditionsChanged); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_READ_FINISH)) { + assertTrue("onFileReadFinish was not called", capturedEvents.fileReadFinish); + } else { + assertFalse("onFileReadFinish was called", capturedEvents.fileReadFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_WRITE_FINISH)) { + assertTrue("onFileWriteFinish was not called", capturedEvents.fileWriteFinish); + } else { + assertFalse("onFileWriteFinish was called", capturedEvents.fileWriteFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_FLUSH_FINISH)) { + assertTrue("onFileFlushFinish was not called", capturedEvents.fileFlushFinish); + } else { + assertFalse("onFileFlushFinish was called", capturedEvents.fileFlushFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_SYNC_FINISH)) { + assertTrue("onFileSyncFinish was not called", capturedEvents.fileSyncFinish); + } else { + assertFalse("onFileSyncFinish was called", capturedEvents.fileSyncFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_RANGE_SYNC_FINISH)) { + assertTrue("onFileRangeSyncFinish was not called", capturedEvents.fileRangeSyncFinish); + } else { + assertFalse("onFileRangeSyncFinish was called", capturedEvents.fileRangeSyncFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_TRUNCATE_FINISH)) { + assertTrue("onFileTruncateFinish was not called", capturedEvents.fileTruncateFinish); + } else { + assertFalse("onFileTruncateFinish was called", capturedEvents.fileTruncateFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_CLOSE_FINISH)) { + assertTrue("onFileCloseFinish was not called", capturedEvents.fileCloseFinish); + } else { + assertFalse("onFileCloseFinish was called", capturedEvents.fileCloseFinish); + } + + if (expected.contains(EnabledEventCallback.SHOULD_BE_NOTIFIED_ON_FILE_IO)) { + assertTrue( + "shouldBeNotifiedOnFileIO was not called", capturedEvents.shouldBeNotifiedOnFileIO); + } else { + assertFalse("shouldBeNotifiedOnFileIO was called", capturedEvents.shouldBeNotifiedOnFileIO); + } + + if (expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_BEGIN)) { + assertTrue("onErrorRecoveryBegin was not called", capturedEvents.errorRecoveryBegin); + } else { + assertFalse("onErrorRecoveryBegin was called", capturedEvents.errorRecoveryBegin); + } + + if (expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED)) { + assertTrue("onErrorRecoveryCompleted was not called", capturedEvents.errorRecoveryCompleted); + } else { + assertFalse("onErrorRecoveryCompleted was called", capturedEvents.errorRecoveryCompleted); + } + } + + /** + * Members are volatile as they may be written + * and read by different threads. 
+ */ + private static class ListenerEvents { + volatile boolean flushCompleted; + volatile boolean flushBegin; + volatile boolean tableFileDeleted; + volatile boolean compactionBegin; + volatile boolean compactionCompleted; + volatile boolean tableFileCreated; + volatile boolean tableFileCreationStarted; + volatile boolean memTableSealed; + volatile boolean columnFamilyHandleDeletionStarted; + volatile boolean externalFileIngested; + volatile boolean backgroundError; + volatile boolean stallConditionsChanged; + volatile boolean fileReadFinish; + volatile boolean fileWriteFinish; + volatile boolean fileFlushFinish; + volatile boolean fileSyncFinish; + volatile boolean fileRangeSyncFinish; + volatile boolean fileTruncateFinish; + volatile boolean fileCloseFinish; + volatile boolean shouldBeNotifiedOnFileIO; + volatile boolean errorRecoveryBegin; + volatile boolean errorRecoveryCompleted; + } + + private static class CapturingTestableEventListener extends TestableEventListener { + final ListenerEvents capturedListenerEvents = new ListenerEvents(); + + public CapturingTestableEventListener() {} + + public CapturingTestableEventListener(final EnabledEventCallback... enabledEventCallbacks) { + super(enabledEventCallbacks); + } + + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + capturedListenerEvents.flushCompleted = true; + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + capturedListenerEvents.flushBegin = true; + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + capturedListenerEvents.tableFileDeleted = true; + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + capturedListenerEvents.compactionBegin = true; + } + + @Override + public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + capturedListenerEvents.compactionCompleted = true; + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + capturedListenerEvents.tableFileCreated = true; + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + capturedListenerEvents.tableFileCreationStarted = true; + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + capturedListenerEvents.memTableSealed = true; + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + capturedListenerEvents.columnFamilyHandleDeletionStarted = true; + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + capturedListenerEvents.externalFileIngested = true; + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + capturedListenerEvents.backgroundError = true; + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + capturedListenerEvents.stallConditionsChanged = true; + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileReadFinish = true; + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileWriteFinish = true; + } + + @Override + public void onFileFlushFinish(final FileOperationInfo 
fileOperationInfo) { + capturedListenerEvents.fileFlushFinish = true; + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileSyncFinish = true; + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileRangeSyncFinish = true; + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileTruncateFinish = true; + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileCloseFinish = true; + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + capturedListenerEvents.shouldBeNotifiedOnFileIO = true; + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + capturedListenerEvents.errorRecoveryBegin = true; + return true; + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + capturedListenerEvents.errorRecoveryCompleted = true; + } + } +} diff --git a/java/src/test/java/org/rocksdb/LRUCacheTest.java b/java/src/test/java/org/rocksdb/LRUCacheTest.java index d2cd15b7e97..275cb560a13 100644 --- a/java/src/test/java/org/rocksdb/LRUCacheTest.java +++ b/java/src/test/java/org/rocksdb/LRUCacheTest.java @@ -5,23 +5,27 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.ClassRule; import org.junit.Test; public class LRUCacheTest { - - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); @Test public void newLRUCache() { - final long capacity = 1000; + final long capacity = 80000000; final int numShardBits = 16; final boolean strictCapacityLimit = true; - final double highPriPoolRatio = 5; + final double highPriPoolRatio = 0.05; try(final Cache lruCache = new LRUCache(capacity, numShardBits, strictCapacityLimit, highPriPoolRatio)) { //no op + assertThat(lruCache.getUsage()).isGreaterThanOrEqualTo(0); + assertThat(lruCache.getPinnedUsage()).isGreaterThanOrEqualTo(0); } } } diff --git a/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java b/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java index d1bdf0f8844..970e58c0c2e 100644 --- a/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java +++ b/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -15,6 +15,9 @@ import static org.junit.Assert.assertEquals; public class NativeComparatorWrapperTest { + static { + RocksDB.loadLibrary(); + } @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index 043de032c1a..e402cb4748a 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -6,13 +6,13 @@ package org.rocksdb; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; +import static org.junit.Assert.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; import org.junit.ClassRule; import org.junit.Test; import 
org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; @@ -1436,4 +1436,38 @@ public void skipCheckingSstFileSizesOnDbOpen() { assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true); } } + + @Test + public void eventListeners() { + final AtomicBoolean wasCalled1 = new AtomicBoolean(); + final AtomicBoolean wasCalled2 = new AtomicBoolean(); + try (final Options options = new Options(); + final AbstractEventListener el1 = + new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + wasCalled1.set(true); + } + }; + final AbstractEventListener el2 = + new AbstractEventListener() { + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + wasCalled2.set(true); + } + }) { + assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); + List listeners = options.listeners(); + assertEquals(el1, listeners.get(0)); + assertEquals(el2, listeners.get(1)); + options.setListeners(Collections.emptyList()); + listeners.get(0).onTableFileDeleted(null); + assertTrue(wasCalled1.get()); + listeners.get(1).onMemTableSealed(null); + assertTrue(wasCalled2.get()); + List listeners2 = options.listeners(); + assertNotNull(listeners2); + assertEquals(0, listeners2.size()); + } + } } diff --git a/java/src/test/java/org/rocksdb/ReadOnlyTest.java b/java/src/test/java/org/rocksdb/ReadOnlyTest.java index ad6e746aa62..5b40a5df1fa 100644 --- a/java/src/test/java/org/rocksdb/ReadOnlyTest.java +++ b/java/src/test/java/org/rocksdb/ReadOnlyTest.java @@ -31,115 +31,60 @@ public void readOnlyOpen() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); - try (final RocksDB db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath())) { - assertThat("value"). 
- isEqualTo(new String(db2.get("key".getBytes()))); - } + } + try (final RocksDB db = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath())) { + assertThat("value").isEqualTo(new String(db.get("key".getBytes()))); } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { final List cfDescriptors = new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); - + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List columnFamilyHandleList = new ArrayList<>(); - try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList)) { - try (final ColumnFamilyOptions newCfOpts = new ColumnFamilyOptions(); - final ColumnFamilyOptions newCf2Opts = new ColumnFamilyOptions() - ) { - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes(), newCfOpts))); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2".getBytes(), newCf2Opts))); - db.put(columnFamilyHandleList.get(2), "key2".getBytes(), - "value2".getBytes()); + try (final RocksDB db = RocksDB.open( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + columnFamilyHandleList.add( + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpts))); + columnFamilyHandleList.add( + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts))); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), "value2".getBytes()); + } - final List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try (final RocksDB db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList)) { - try (final ColumnFamilyOptions newCfOpts2 = - new ColumnFamilyOptions(); - final ColumnFamilyOptions newCf2Opts2 = - new ColumnFamilyOptions() - ) { - assertThat(db2.get("key2".getBytes())).isNull(); - assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), - "key2".getBytes())). 
- isNull(); - cfDescriptors.clear(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - newCfOpts2)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), - newCf2Opts2)); + columnFamilyHandleList.clear(); + try (final RocksDB db = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + assertThat(db.get("key2".getBytes())).isNull(); + assertThat(db.get(columnFamilyHandleList.get(0), "key2".getBytes())).isNull(); + } - final List readOnlyColumnFamilyHandleList2 - = new ArrayList<>(); - try (final RocksDB db3 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList2)) { - try { - assertThat(new String(db3.get( - readOnlyColumnFamilyHandleList2.get(1), - "key2".getBytes()))).isEqualTo("value2"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList2) { - columnFamilyHandle.close(); - } - } - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + cfDescriptors.clear(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts)); + columnFamilyHandleList.clear(); + try (final RocksDB db = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + assertThat(new String(db.get(columnFamilyHandleList.get(1), "key2".getBytes()))) + .isEqualTo("value2"); } } } @Test(expected = RocksDBException.class) public void failToWriteInReadOnly() throws RocksDBException { - try (final Options options = new Options() - .setCreateIfMissing(true)) { - - try (final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { - //no-op + try (final Options options = new Options().setCreateIfMissing(true)) { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + // no-op } } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); - final List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try (final RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList)) { - try { - // test that put fails in readonly mode - rDb.put("key".getBytes(), "value".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + final List readOnlyColumnFamilyHandleList = new ArrayList<>(); + try (final RocksDB rDb = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, readOnlyColumnFamilyHandleList)) { + // test that put fails in readonly mode + rDb.put("key".getBytes(), "value".getBytes()); } } } @@ -161,15 +106,7 @@ public void failToCFWriteInReadOnly() throws RocksDBException { try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { - rDb.put(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes(), 
"value".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + rDb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); } } } @@ -193,14 +130,7 @@ public void failToRemoveInReadOnly() throws RocksDBException { try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { - rDb.delete("key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + rDb.delete("key".getBytes()); } } } @@ -223,15 +153,8 @@ public void failToCFRemoveInReadOnly() throws RocksDBException { try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { rDb.delete(readOnlyColumnFamilyHandleList.get(0), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -256,15 +179,8 @@ public void failToWriteBatchReadOnly() throws RocksDBException { readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - try { wb.put("key".getBytes(), "value".getBytes()); rDb.write(wOpts, wb); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -289,16 +205,9 @@ public void failToCFWriteBatchReadOnly() throws RocksDBException { readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - try { wb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); rDb.write(wOpts, wb); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -318,14 +227,7 @@ public void errorIfWalFileExists() throws RocksDBException { try (final DBOptions options = new DBOptions(); final RocksDB rDb = RocksDB.openReadOnly(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList, true);) { - try { - // no-op... should have raised an error as errorIfWalFileExists=true - - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + // no-op... 
should have raised an error as errorIfWalFileExists=true } } } diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java index 675023ef3a2..689c48cb0ef 100644 --- a/java/src/test/java/org/rocksdb/ReadOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -159,6 +159,8 @@ public void iterateUpperBound() { Slice upperBound = buildRandomSlice(); opt.setIterateUpperBound(upperBound); assertThat(Arrays.equals(upperBound.data(), opt.iterateUpperBound().data())).isTrue(); + opt.setIterateUpperBound(null); + assertThat(opt.iterateUpperBound()).isNull(); } } @@ -175,6 +177,8 @@ public void iterateLowerBound() { Slice lowerBound = buildRandomSlice(); opt.setIterateLowerBound(lowerBound); assertThat(Arrays.equals(lowerBound.data(), opt.iterateLowerBound().data())).isTrue(); + opt.setIterateLowerBound(null); + assertThat(opt.iterateLowerBound()).isNull(); } } diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index fc62dc80e1f..20588084c80 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -1271,6 +1271,26 @@ public void getApproximateMemTableStats() throws RocksDBException { } } + @Test + public void getApproximateMemTableStatsSingleKey() throws RocksDBException { + final byte key1[] = "key1".getBytes(UTF_8); + final byte key2[] = "key2".getBytes(UTF_8); + final byte key3[] = "key3".getBytes(UTF_8); + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put(key1, key1); + + final RocksDB.CountAndSize stats = + db.getApproximateMemTableStats(new Range(new Slice(key1), new Slice(key3))); + + assertThat(stats).isNotNull(); + assertThat(stats.count).isEqualTo(1); + assertThat(stats.size).isGreaterThan(1); + } + } + } + @Ignore("TODO(AR) re-enable when ready!") @Test public void compactFiles() throws RocksDBException { @@ -1456,11 +1476,11 @@ public void getLiveFiles() throws RocksDBException { try (final RocksDB db = RocksDB.open(options, dbPath)) { final RocksDB.LiveFiles livefiles = db.getLiveFiles(true); assertThat(livefiles).isNotNull(); - assertThat(livefiles.manifestFileSize).isEqualTo(13); + assertThat(livefiles.manifestFileSize).isEqualTo(57); assertThat(livefiles.files.size()).isEqualTo(3); assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); - assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000001"); - assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000005"); + assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000004"); + assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000007"); } } } diff --git a/java/src/test/java/org/rocksdb/test/TestableEventListener.java b/java/src/test/java/org/rocksdb/test/TestableEventListener.java new file mode 100644 index 00000000000..865ad5cf78b --- /dev/null +++ b/java/src/test/java/org/rocksdb/test/TestableEventListener.java @@ -0,0 +1,23 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb.test; + +import org.rocksdb.AbstractEventListener; + +public class TestableEventListener extends AbstractEventListener { + public TestableEventListener() { + super(); + } + + public TestableEventListener(final EnabledEventCallback... enabledEventCallbacks) { + super(enabledEventCallbacks); + } + + public void invokeAllCallbacks() { + invokeAllCallbacks(nativeHandle_); + } + + private static native void invokeAllCallbacks(final long handle); +} diff --git a/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java b/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java index 89081947193..f80e69c1c60 100644 --- a/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java +++ b/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java @@ -156,8 +156,10 @@ public boolean equals(final Object o) { @Override public int hashCode() { - - return Objects.hash(action, columnFamilyId, key, value); + int result = Objects.hash(action, columnFamilyId); + result = 31 * result + Arrays.hashCode(key); + result = 31 * result + Arrays.hashCode(value); + return result; } } diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc index 3533724ba34..1ff08c1adef 100644 --- a/logging/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -6,8 +6,12 @@ #include "logging/auto_roll_logger.h" #include + #include "file/filename.h" #include "logging/logging.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -15,7 +19,9 @@ namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE // -- AutoRollLogger -AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, +AutoRollLogger::AutoRollLogger(const std::shared_ptr& fs, + const std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, size_t log_max_size, size_t log_file_time_to_roll, @@ -24,24 +30,26 @@ AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, : Logger(log_level), dbname_(dbname), db_log_dir_(db_log_dir), - env_(env), + fs_(fs), + clock_(clock), status_(Status::OK()), kMaxLogFileSize(log_max_size), kLogFileTimeToRoll(log_file_time_to_roll), kKeepLogFileNum(keep_log_file_num), - cached_now(static_cast(env_->NowMicros() * 1e-6)), + cached_now(static_cast(clock_->NowMicros() * 1e-6)), ctime_(cached_now), cached_now_access_count(0), call_NowMicros_every_N_records_(100), mutex_() { - Status s = env->GetAbsolutePath(dbname, &db_absolute_path_); + Status s = fs->GetAbsolutePath(dbname, io_options_, &db_absolute_path_, + &io_context_); if (s.IsNotSupported()) { db_absolute_path_ = dbname; } else { status_ = s; } log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); - if (env_->FileExists(log_fname_).ok()) { + if (fs_->FileExists(log_fname_, io_options_, &io_context_).ok()) { RollLogFile(); } GetExistingFiles(); @@ -53,7 +61,7 @@ AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, Status AutoRollLogger::ResetLogger() { TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); - status_ = env_->NewLogger(log_fname_, &logger_); + status_ = fs_->NewLogger(log_fname_, io_options_, &logger_, &io_context_); TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); if (!status_.ok()) { @@ -67,7 +75,7 @@ Status AutoRollLogger::ResetLogger() { "The underlying logger doesn't support GetLogFileSize()"); } if (status_.ok()) { - cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now = 
static_cast(clock_->NowMicros() * 1e-6); ctime_ = cached_now; cached_now_access_count = 0; } @@ -79,14 +87,14 @@ void AutoRollLogger::RollLogFile() { // This function is called when log is rotating. Two rotations // can happen quickly (NowMicro returns same value). To not overwrite // previous log file we increment by one micro second and try again. - uint64_t now = env_->NowMicros(); + uint64_t now = clock_->NowMicros(); std::string old_fname; do { old_fname = OldInfoLogFileName( dbname_, now, db_absolute_path_, db_log_dir_); now++; - } while (env_->FileExists(old_fname).ok()); - Status s = env_->RenameFile(log_fname_, old_fname); + } while (fs_->FileExists(old_fname, io_options_, &io_context_).ok()); + Status s = fs_->RenameFile(log_fname_, old_fname, io_options_, &io_context_); if (!s.ok()) { // What should we do on error? } @@ -103,7 +111,7 @@ void AutoRollLogger::GetExistingFiles() { std::string parent_dir; std::vector info_log_files; Status s = - GetInfoLogFiles(env_, db_log_dir_, dbname_, &parent_dir, &info_log_files); + GetInfoLogFiles(fs_, db_log_dir_, dbname_, &parent_dir, &info_log_files); if (status_.ok()) { status_ = s; } @@ -117,7 +125,7 @@ void AutoRollLogger::GetExistingFiles() { } Status AutoRollLogger::TrimOldLogFiles() { - // Here we directly list info files and delete them through Env. + // Here we directly list info files and delete them through FileSystem. // The deletion isn't going through DB, so there are shortcomes: // 1. the deletion is not rate limited by SstFileManager // 2. there is a chance that an I/O will be issued here @@ -130,7 +138,8 @@ Status AutoRollLogger::TrimOldLogFiles() { // it's essentially the same thing, and checking empty before accessing // the queue feels safer. while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) { - Status s = env_->DeleteFile(old_log_files_.front()); + Status s = + fs_->DeleteFile(old_log_files_.front(), io_options_, &io_context_); // Remove the file from the tracking anyway. It's possible that // DB cleaned up the old log file, or people cleaned it up manually. 
old_log_files_.pop(); @@ -241,7 +250,7 @@ void AutoRollLogger::LogHeader(const char* format, va_list args) { bool AutoRollLogger::LogExpired() { if (cached_now_access_count >= call_NowMicros_every_N_records_) { - cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now = static_cast(clock_->NowMicros() * 1e-6); cached_now_access_count = 0; } @@ -267,15 +276,16 @@ Status CreateLoggerFromOptions(const std::string& dbname, std::string fname = InfoLogFileName(dbname, db_absolute_path, options.db_log_dir); + const auto& clock = env->GetSystemClock(); env->CreateDirIfMissing(dbname) .PermitUncheckedError(); // In case it does not exist // Currently we only support roll by time-to-roll and log size #ifndef ROCKSDB_LITE if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( - env, dbname, options.db_log_dir, options.max_log_file_size, - options.log_file_time_to_roll, options.keep_log_file_num, - options.info_log_level); + env->GetFileSystem(), clock, dbname, options.db_log_dir, + options.max_log_file_size, options.log_file_time_to_roll, + options.keep_log_file_num, options.info_log_level); s = result->GetStatus(); if (!s.ok()) { delete result; @@ -286,9 +296,9 @@ Status CreateLoggerFromOptions(const std::string& dbname, } #endif // !ROCKSDB_LITE // Open a log file in the same directory as the db - env->RenameFile(fname, - OldInfoLogFileName(dbname, env->NowMicros(), db_absolute_path, - options.db_log_dir)) + env->RenameFile( + fname, OldInfoLogFileName(dbname, clock->NowMicros(), db_absolute_path, + options.db_log_dir)) .PermitUncheckedError(); s = env->NewLogger(fname, logger); if (logger->get() != nullptr) { diff --git a/logging/auto_roll_logger.h b/logging/auto_roll_logger.h index 2b63fc9d9c5..ccbce1d9940 100644 --- a/logging/auto_roll_logger.h +++ b/logging/auto_roll_logger.h @@ -18,14 +18,18 @@ #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +class FileSystem; +class SystemClock; #ifndef ROCKSDB_LITE // Rolls the log file by size and/or time class AutoRollLogger : public Logger { public: - AutoRollLogger(Env* env, const std::string& dbname, - const std::string& db_log_dir, size_t log_max_size, - size_t log_file_time_to_roll, size_t keep_log_file_num, + AutoRollLogger(const std::shared_ptr& fs, + const std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, + size_t log_max_size, size_t log_file_time_to_roll, + size_t keep_log_file_num, const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); using Logger::Logv; @@ -134,7 +138,8 @@ class AutoRollLogger : public Logger { std::string dbname_; std::string db_log_dir_; std::string db_absolute_path_; - Env* env_; + std::shared_ptr fs_; + std::shared_ptr clock_; std::shared_ptr logger_; // current status of the logger Status status_; @@ -148,11 +153,13 @@ class AutoRollLogger : public Logger { // Full path is stored here. It consumes signifianctly more memory // than only storing file name. Can optimize if it causes a problem. 
std::queue old_log_files_; - // to avoid frequent env->NowMicros() calls, we cached the current time + // to avoid frequent clock->NowMicros() calls, we cached the current time uint64_t cached_now; uint64_t ctime_; uint64_t cached_now_access_count; uint64_t call_NowMicros_every_N_records_; + IOOptions io_options_; + IODebugContext io_context_; mutable port::Mutex mutex_; }; #endif // !ROCKSDB_LITE diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index 520d9de2354..59e0ebac658 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -7,8 +7,9 @@ #ifndef ROCKSDB_LITE #include "logging/auto_roll_logger.h" -#include + #include + #include #include #include @@ -17,18 +18,24 @@ #include #include #include + #include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" namespace ROCKSDB_NAMESPACE { namespace { -class NoSleepEnv : public EnvWrapper { +class NoSleepClock : public SystemClockWrapper { public: - NoSleepEnv(Env* base) : EnvWrapper(base) {} + NoSleepClock( + const std::shared_ptr& base = SystemClock::Default()) + : SystemClockWrapper(base) {} + const char* Name() const override { return "NoSleepClock"; } void SleepForMicroseconds(int micros) override { fake_time_ += static_cast(micros); } @@ -76,7 +83,9 @@ class AutoRollLoggerTest : public testing::Test { void RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, const std::string& log_message); - void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, + void RollLogFileByTimeTest(const std::shared_ptr& fs, + const std::shared_ptr& sc, + AutoRollLogger* logger, size_t time, const std::string& log_message); // return list of files under kTestDir that contains "LOG" std::vector GetLogFiles() { @@ -157,21 +166,22 @@ void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, ASSERT_TRUE(message_size == logger->GetLogFileSize()); } -void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, - size_t time, - const std::string& log_message) { +void AutoRollLoggerTest::RollLogFileByTimeTest( + const std::shared_ptr& fs, + const std::shared_ptr& sc, AutoRollLogger* logger, size_t time, + const std::string& log_message) { uint64_t expected_ctime; uint64_t actual_ctime; uint64_t total_log_size; - EXPECT_OK(env->GetFileSize(kLogFile, &total_log_size)); + EXPECT_OK(fs->GetFileSize(kLogFile, IOOptions(), &total_log_size, nullptr)); expected_ctime = logger->TEST_ctime(); logger->SetCallNowMicrosEveryNRecords(0); // -- Write to the log for several times, which is supposed // to be finished before time. for (int i = 0; i < 10; ++i) { - env->SleepForMicroseconds(50000); + sc->SleepForMicroseconds(50000); LogMessage(logger, log_message.c_str()); EXPECT_OK(logger->GetStatus()); // Make sure we always write to the same log file (by @@ -186,7 +196,7 @@ void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, } // -- Make the log file expire - env->SleepForMicroseconds(static_cast(time * 1000000)); + sc->SleepForMicroseconds(static_cast(time * 1000000)); LogMessage(logger, log_message.c_str()); // At this time, the new log file should be created. 
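
For reference, the hunks above replace AutoRollLogger's old Env* parameter with a FileSystem plus a SystemClock, and the tests drive time through a SystemClockWrapper. A minimal sketch of the new call pattern, assuming a non-ROCKSDB_LITE build; the helper function name, path, and size values below are illustrative and not taken from the patch:

#include "logging/auto_roll_logger.h"
#include "rocksdb/file_system.h"
#include "rocksdb/system_clock.h"

void MakeExampleLogger() {
  using namespace ROCKSDB_NAMESPACE;
  // Roll by size only (1 MB per file), keep at most 10 rolled files.
  // The dbname directory is assumed to exist already.
  AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
                        /*dbname=*/"/tmp/rocksdb_example",
                        /*db_log_dir=*/"",
                        /*log_max_size=*/1024 * 1024,
                        /*log_file_time_to_roll=*/0,
                        /*keep_log_file_num=*/10);
  // In tests, a SystemClockWrapper subclass (such as the NoSleepClock above)
  // can be passed in place of SystemClock::Default() to control time.
}
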
@@ -200,15 +210,15 @@ TEST_F(AutoRollLoggerTest, RollLogFileBySize) { size_t log_max_size = 1024 * 5; size_t keep_log_file_num = 10; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0, - keep_log_file_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_max_size, 0, keep_log_file_num); RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":RollLogFileBySize"); } TEST_F(AutoRollLoggerTest, RollLogFileByTime) { - NoSleepEnv nse(Env::Default()); + auto nsc = std::make_shared(); size_t time = 2; size_t log_size = 1024 * 5; @@ -217,10 +227,11 @@ TEST_F(AutoRollLoggerTest, RollLogFileByTime) { InitTestDb(); // -- Test the existence of file during the server restart. ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); - AutoRollLogger logger(&nse, kTestDir, "", log_size, time, keep_log_file_num); + AutoRollLogger logger(default_env->GetFileSystem(), nsc, kTestDir, "", + log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); - RollLogFileByTimeTest(&nse, &logger, time, + RollLogFileByTimeTest(default_env->GetFileSystem(), nsc, &logger, time, kSampleMessage + ":RollLogFileByTime"); } @@ -255,15 +266,17 @@ TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) { size_t log_size = 1024; size_t keep_log_file_num = 10; - AutoRollLogger* logger = new AutoRollLogger(Env::Default(), kTestDir, "", - log_size, 0, keep_log_file_num); + AutoRollLogger* logger = + new AutoRollLogger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, keep_log_file_num); LogMessage(logger, kSampleMessage.c_str()); ASSERT_GT(logger->GetLogFileSize(), kZero); delete logger; // reopens the log file and an empty log file will be created. 
- logger = new AutoRollLogger(Env::Default(), kTestDir, "", log_size, 0, 10); + logger = new AutoRollLogger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, 10); ASSERT_EQ(logger->GetLogFileSize(), kZero); delete logger; } @@ -274,16 +287,16 @@ TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { InitTestDb(); - NoSleepEnv nse(Env::Default()); - AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time, - keep_log_file_num); + auto nsc = std::make_shared(); + AutoRollLogger logger(FileSystem::Default(), nsc, kTestDir, "", log_max_size, + time, keep_log_file_num); // Test the ability to roll by size RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); // Test the ability to roll by Time - RollLogFileByTimeTest(&nse, &logger, time, + RollLogFileByTimeTest(FileSystem::Default(), nsc, &logger, time, kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); } @@ -292,7 +305,9 @@ TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { // port TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { DBOptions options; - NoSleepEnv nse(Env::Default()); + auto nsc = std::make_shared(); + std::unique_ptr nse(new CompositeEnvWrapper(Env::Default(), nsc)); + std::shared_ptr logger; // Normal logger @@ -311,14 +326,15 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { kSampleMessage + ":CreateLoggerFromOptions - size"); // Only roll by Time - options.env = &nse; + options.env = nse.get(); InitTestDb(); options.max_log_file_size = 0; options.log_file_time_to_roll = 2; ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); auto_roll_logger = dynamic_cast(logger.get()); - RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger, + options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - time"); // roll by both Time and size @@ -330,7 +346,8 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { dynamic_cast(logger.get()); RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, kSampleMessage + ":CreateLoggerFromOptions - both"); - RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger, + options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); // Set keep_log_file_num @@ -403,8 +420,8 @@ TEST_F(AutoRollLoggerTest, AutoDeleting) { const size_t kMaxFileSize = 512; { size_t log_num = 8; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); RollNTimesBySize(&logger, log_num, kMaxFileSize); ASSERT_EQ(log_num, GetLogFiles().size()); @@ -412,8 +429,8 @@ TEST_F(AutoRollLoggerTest, AutoDeleting) { // Shrink number of files { size_t log_num = 5; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); ASSERT_EQ(log_num, GetLogFiles().size()); RollNTimesBySize(&logger, 3, kMaxFileSize); @@ -423,8 +440,8 @@ TEST_F(AutoRollLoggerTest, AutoDeleting) { // Increase number of files again. 
{ size_t log_num = 7; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); ASSERT_EQ(6, GetLogFiles().size()); RollNTimesBySize(&logger, 3, kMaxFileSize); @@ -486,7 +503,8 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) { // an extra-scope to force the AutoRollLogger to flush the log file when it // becomes out of scope. { - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -524,7 +542,8 @@ TEST_F(AutoRollLoggerTest, Close) { size_t log_size = 8192; size_t log_lines = 0; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), kTestDir, + "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -591,8 +610,9 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { InitTestDb(); - AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/"", - LOG_MAX_SIZE, /*log_file_time_to_roll=*/0, + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, /*db_log_dir=*/"", LOG_MAX_SIZE, + /*log_file_time_to_roll=*/0, /*keep_log_file_num=*/10); if (test_num == 0) { diff --git a/logging/env_logger.h b/logging/env_logger.h index 9fecb50cf94..e8e9f1abe31 100644 --- a/logging/env_logger.h +++ b/logging/env_logger.h @@ -31,15 +31,16 @@ class EnvLogger : public Logger { const std::string& fname, const EnvOptions& options, Env* env, InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) : Logger(log_level), - file_(std::move(writable_file), fname, options, env), - last_flush_micros_(0), env_(env), + clock_(env_->GetSystemClock().get()), + file_(std::move(writable_file), fname, options, clock_), + last_flush_micros_(0), flush_pending_(false) {} ~EnvLogger() { if (!closed_) { closed_ = true; - CloseHelper(); + CloseHelper().PermitUncheckedError(); } } @@ -48,9 +49,9 @@ class EnvLogger : public Logger { mutex_.AssertHeld(); if (flush_pending_) { flush_pending_ = false; - file_.Flush(); + file_.Flush().PermitUncheckedError(); } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } void Flush() override { @@ -134,9 +135,9 @@ class EnvLogger : public Logger { assert(p <= limit); mutex_.Lock(); // We will ignore any error returned by Append(). - file_.Append(Slice(base, p - base)); + file_.Append(Slice(base, p - base)).PermitUncheckedError(); flush_pending_ = true; - const uint64_t now_micros = env_->NowMicros(); + const uint64_t now_micros = clock_->NowMicros(); if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { FlushLocked(); } @@ -154,11 +155,12 @@ class EnvLogger : public Logger { } private: + Env* env_; + SystemClock* clock_; WritableFileWriter file_; mutable port::Mutex mutex_; // Mutex to protect the shared variables below. 
const static uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; std::atomic flush_pending_; }; diff --git a/logging/event_logger.cc b/logging/event_logger.cc index f1747ad253a..78bf4f8ff5b 100644 --- a/logging/event_logger.cc +++ b/logging/event_logger.cc @@ -10,7 +10,6 @@ #include #include -#include "logging/logging.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff --git a/memory/arena.cc b/memory/arena.cc index ed46459d9cc..bcdad5c76fa 100644 --- a/memory/arena.cc +++ b/memory/arena.cc @@ -12,11 +12,13 @@ #include #endif #include + #include "logging/logging.h" #include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -160,7 +162,7 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, #ifdef MAP_HUGETLB if (huge_page_size > 0 && bytes > 0) { - // Allocate from a huge page TBL table. + // Allocate from a huge page TLB table. assert(logger != nullptr); // logger need to be passed in. size_t reserved_size = ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; @@ -170,7 +172,7 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, if (addr == nullptr) { ROCKS_LOG_WARN(logger, "AllocateAligned fail to allocate huge TLB pages: %s", - strerror(errno)); + errnoStr(errno).c_str()); // fail back to malloc } else { return addr; diff --git a/memory/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc index 980b08b955f..a01034e3fd6 100644 --- a/memory/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -132,6 +132,9 @@ size_t JemallocNodumpAllocator::UsableSize(void* p, Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator) { + if (memory_allocator == nullptr) { + return Status::InvalidArgument("memory_allocator must be non-null."); + } *memory_allocator = nullptr; Status unsupported = Status::NotSupported( "JemallocNodumpAllocator only available with jemalloc version >= 5 " @@ -143,9 +146,6 @@ Status NewJemallocNodumpAllocator( if (!HasJemalloc()) { return unsupported; } - if (memory_allocator == nullptr) { - return Status::InvalidArgument("memory_allocator must be non-null."); - } if (options.limit_tcache_size && options.tcache_size_lower_bound >= options.tcache_size_upper_bound) { return Status::InvalidArgument( diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 0f620304247..d476d03fb1d 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -31,6 +31,7 @@ int main() { #include "rocksdb/memtablerep.h" #include "rocksdb/options.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/write_buffer_manager.h" #include "test_util/testutil.h" #include "util/gflags_compat.h" @@ -417,7 +418,7 @@ class Benchmark { uint64_t bytes_written = 0; uint64_t bytes_read = 0; uint64_t read_hits = 0; - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits); auto elapsed_time = static_cast(timer.ElapsedNanos() / 1000); std::cout << "Elapsed time: " << static_cast(elapsed_time) << " us" diff --git a/memtable/write_buffer_manager.cc b/memtable/write_buffer_manager.cc index 9b747087081..ecbccb82b3d 100644 --- a/memtable/write_buffer_manager.cc +++ b/memtable/write_buffer_manager.cc @@ -8,7 +8,9 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" -#include + +#include "cache/cache_entry_roles.h" +#include "db/db_impl/db_impl.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { @@ -49,12 +51,16 @@ struct WriteBufferManager::CacheRep {}; #endif // ROCKSDB_LITE WriteBufferManager::WriteBufferManager(size_t _buffer_size, - std::shared_ptr cache) + std::shared_ptr cache, + bool allow_stall) : buffer_size_(_buffer_size), mutable_limit_(buffer_size_ * 7 / 8), memory_used_(0), memory_active_(0), - cache_rep_(nullptr) { + dummy_size_(0), + cache_rep_(nullptr), + allow_stall_(allow_stall), + stall_active_(false) { #ifndef ROCKSDB_LITE if (cache) { // Construct the cache key using the pointer to this. @@ -77,6 +83,17 @@ WriteBufferManager::~WriteBufferManager() { #endif // ROCKSDB_LITE } +void WriteBufferManager::ReserveMem(size_t mem) { + if (cache_rep_ != nullptr) { + ReserveMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_add(mem, std::memory_order_relaxed); + } + if (enabled()) { + memory_active_.fetch_add(mem, std::memory_order_relaxed); + } +} + // Should only be called from write thread void WriteBufferManager::ReserveMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE @@ -91,9 +108,9 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { // Expand size by at least 256KB. // Add a dummy record to the cache Cache::Handle* handle = nullptr; - Status s = - cache_rep_->cache_->Insert(cache_rep_->GetNextCacheKey(), nullptr, - kSizeDummyEntry, nullptr, &handle); + Status s = cache_rep_->cache_->Insert( + cache_rep_->GetNextCacheKey(), nullptr, kSizeDummyEntry, + GetNoopDeleterForRole(), &handle); s.PermitUncheckedError(); // TODO: What to do on error? // We keep the handle even if insertion fails and a null handle is // returned, so that when memory shrinks, we don't release extra @@ -104,12 +121,31 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { // it in the future. cache_rep_->dummy_handles_.push_back(handle); cache_rep_->cache_allocated_size_ += kSizeDummyEntry; + dummy_size_.fetch_add(kSizeDummyEntry, std::memory_order_relaxed); } #else (void)mem; #endif // ROCKSDB_LITE } +void WriteBufferManager::ScheduleFreeMem(size_t mem) { + if (enabled()) { + memory_active_.fetch_sub(mem, std::memory_order_relaxed); + } +} + +void WriteBufferManager::FreeMem(size_t mem) { + if (cache_rep_ != nullptr) { + FreeMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_sub(mem, std::memory_order_relaxed); + } + // Check if stall is active and can be ended. + if (allow_stall_) { + EndWriteStall(); + } +} + void WriteBufferManager::FreeMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE assert(cache_rep_ != nullptr); @@ -137,9 +173,56 @@ void WriteBufferManager::FreeMemWithCache(size_t mem) { } cache_rep_->dummy_handles_.pop_back(); cache_rep_->cache_allocated_size_ -= kSizeDummyEntry; + dummy_size_.fetch_sub(kSizeDummyEntry, std::memory_order_relaxed); } #else (void)mem; #endif // ROCKSDB_LITE } + +void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + if (wbm_stall) { + std::unique_lock lock(mu_); + queue_.push_back(wbm_stall); + } + // In case thread enqueue itself and memory got freed in parallel, end the + // stall. + if (!ShouldStall()) { + EndWriteStall(); + } +} + +// Called when memory is freed in FreeMem. 
+void WriteBufferManager::EndWriteStall() { + if (enabled() && !IsStallThresholdExceeded()) { + { + std::unique_lock lock(mu_); + stall_active_.store(false, std::memory_order_relaxed); + if (queue_.empty()) { + return; + } + } + + // Get the instances from the list and call WBMStallInterface::Signal to + // change the state to running and unblock the DB instances. + // Check ShouldStall() incase stall got active by other DBs. + while (!ShouldStall() && !queue_.empty()) { + std::unique_lock lock(mu_); + StallInterface* wbm_stall = queue_.front(); + queue_.pop_front(); + wbm_stall->Signal(); + } + } +} + +void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + if (enabled() && allow_stall_) { + std::unique_lock lock(mu_); + queue_.remove(wbm_stall); + wbm_stall->Signal(); + } +} + } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 0cdd7c4780b..7e3de41d18a 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -11,10 +11,11 @@ #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { - class WriteBufferManagerTest : public testing::Test {}; #ifndef ROCKSDB_LITE +const size_t kSizeDummyEntry = 256 * 1024; + TEST_F(WriteBufferManagerTest, ShouldFlush) { // A write buffer manager of size 10MB std::unique_ptr wbf( @@ -46,7 +47,33 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { ASSERT_TRUE(wbf->ShouldFlush()); wbf->FreeMem(7 * 1024 * 1024); - // 9MB total, 8MB mutable. + // 8MB total, 8MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + // change size: 8M limit, 7M mutable limit + wbf->SetBufferSize(8 * 1024 * 1024); + // 8MB total, 8MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->ScheduleFreeMem(2 * 1024 * 1024); + // 8MB total, 6MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->FreeMem(2 * 1024 * 1024); + // 6MB total, 6MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + wbf->ReserveMem(1 * 1024 * 1024); + // 7MB total, 7MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + wbf->ReserveMem(1 * 1024 * 1024); + // 8MB total, 8MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->ScheduleFreeMem(1 * 1024 * 1024); + wbf->FreeMem(1 * 1024 * 1024); + // 7MB total, 7MB mutable. ASSERT_FALSE(wbf->ShouldFlush()); } @@ -65,28 +92,35 @@ TEST_F(WriteBufferManagerTest, CacheCost) { wbf->ReserveMem(333 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 2 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 2 * 256 * 1024 + 10000); + // 2 dummy entries are added for size 333 kb. + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 2 * kSizeDummyEntry); // Allocate another 512KB wbf->ReserveMem(512 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + 10000); + // 2 more dummy entries are added for size 512. + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); // Allocate another 10MB wbf->ReserveMem(10 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 11 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 11 * 1024 * 1024 + 10000); + // 40 more entries are added for size 10 * 1024 * 1024. 
+ ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); // Free 1MB will not cause any change in cache cost wbf->FreeMem(1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 11 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 11 * 1024 * 1024 + 10000); - + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); ASSERT_FALSE(wbf->ShouldFlush()); // Allocate another 41MB wbf->ReserveMem(41 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); ASSERT_TRUE(wbf->ShouldFlush()); ASSERT_TRUE(wbf->ShouldFlush()); @@ -94,7 +128,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { wbf->ScheduleFreeMem(20 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000); - + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); // Still need flush as the hard limit hits ASSERT_TRUE(wbf->ShouldFlush()); @@ -102,6 +136,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { wbf->FreeMem(20 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 203 * kSizeDummyEntry); ASSERT_FALSE(wbf->ShouldFlush()); @@ -109,19 +144,23 @@ TEST_F(WriteBufferManagerTest, CacheCost) { wbf->FreeMem(16 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 2 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 2 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 202 * kSizeDummyEntry); wbf->FreeMem(16 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 201 * kSizeDummyEntry); // Reserve 512KB will not cause any change in cache cost wbf->ReserveMem(512 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 201 * kSizeDummyEntry); wbf->FreeMem(16 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 4 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 4 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 200 * kSizeDummyEntry); // Destory write buffer manger should free everything wbf.reset(); @@ -137,6 +176,7 @@ TEST_F(WriteBufferManagerTest, NoCapCacheCost) { wbf->ReserveMem(10 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 10 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 10 * 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry); ASSERT_FALSE(wbf->ShouldFlush()); wbf->FreeMem(9 * 1024 * 1024); @@ -145,6 +185,7 @@ TEST_F(WriteBufferManagerTest, NoCapCacheCost) { } ASSERT_GE(cache->GetPinnedUsage(), 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); } TEST_F(WriteBufferManagerTest, CacheFull) { @@ -156,16 +197,20 @@ TEST_F(WriteBufferManagerTest, CacheFull) { std::shared_ptr cache = NewLRUCache(lo); std::unique_ptr wbf(new WriteBufferManager(0, cache)); wbf->ReserveMem(10 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry); size_t prev_pinned = 
cache->GetPinnedUsage(); ASSERT_GE(prev_pinned, 10 * 1024 * 1024); + // Some insert will fail wbf->ReserveMem(10 * 1024 * 1024); ASSERT_LE(cache->GetPinnedUsage(), 12 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry); // Increase capacity so next insert will succeed cache->SetCapacity(30 * 1024 * 1024); wbf->ReserveMem(10 * 1024 * 1024); ASSERT_GT(cache->GetPinnedUsage(), 20 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 120 * kSizeDummyEntry); // Gradually release 20 MB for (int i = 0; i < 40; i++) { @@ -173,6 +218,7 @@ TEST_F(WriteBufferManagerTest, CacheFull) { } ASSERT_GE(cache->GetPinnedUsage(), 10 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 20 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 95 * kSizeDummyEntry); } #endif // ROCKSDB_LITE diff --git a/monitoring/histogram_test.cc b/monitoring/histogram_test.cc index 36a7d71542c..fd7c004378b 100644 --- a/monitoring/histogram_test.cc +++ b/monitoring/histogram_test.cc @@ -3,11 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // +#include "monitoring/histogram.h" + #include -#include "monitoring/histogram.h" #include "monitoring/histogram_windowing.h" +#include "rocksdb/system_clock.h" +#include "test_util/mock_time_env.h" #include "test_util/testharness.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -16,16 +20,22 @@ class HistogramTest : public testing::Test {}; namespace { const double kIota = 0.1; const HistogramBucketMapper bucketMapper; - Env* env = Env::Default(); + std::shared_ptr clock = + std::make_shared(SystemClock::Default()); } void PopulateHistogram(Histogram& histogram, uint64_t low, uint64_t high, uint64_t loop = 1) { + Random rnd(test::RandomSeed()); for (; loop > 0; loop--) { for (uint64_t i = low; i <= high; i++) { histogram.Add(i); + // sleep a random microseconds [0-10) + clock->MockSleepForMicroseconds(rnd.Uniform(10)); } } + // make sure each data population at least take some time + clock->MockSleepForMicroseconds(1); } void BasicOperation(Histogram& histogram) { @@ -131,23 +141,23 @@ TEST_F(HistogramTest, HistogramWindowingExpire) { HistogramWindowingImpl histogramWindowing(num_windows, micros_per_window, min_num_per_window); - + histogramWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 100); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 1); ASSERT_EQ(histogramWindowing.Average(), 1); PopulateHistogram(histogramWindowing, 2, 2, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 200); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 2); ASSERT_EQ(histogramWindowing.Average(), 1.5); PopulateHistogram(histogramWindowing, 3, 3, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 3); @@ -155,7 +165,7 @@ TEST_F(HistogramTest, HistogramWindowingExpire) { // dropping oldest window with value 1, remaining 2 ~ 4 PopulateHistogram(histogramWindowing, 4, 4, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); 
ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 2); ASSERT_EQ(histogramWindowing.max(), 4); @@ -163,7 +173,7 @@ TEST_F(HistogramTest, HistogramWindowingExpire) { // dropping oldest window with value 2, remaining 3 ~ 5 PopulateHistogram(histogramWindowing, 5, 5, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 3); ASSERT_EQ(histogramWindowing.max(), 5); @@ -179,18 +189,20 @@ TEST_F(HistogramTest, HistogramWindowingMerge) { histogramWindowing(num_windows, micros_per_window, min_num_per_window); HistogramWindowingImpl otherWindowing(num_windows, micros_per_window, min_num_per_window); + histogramWindowing.TEST_UpdateClock(clock); + otherWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); PopulateHistogram(otherWindowing, 1, 1, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); PopulateHistogram(histogramWindowing, 2, 2, 100); PopulateHistogram(otherWindowing, 2, 2, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); PopulateHistogram(histogramWindowing, 3, 3, 100); PopulateHistogram(otherWindowing, 3, 3, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); histogramWindowing.Merge(otherWindowing); ASSERT_EQ(histogramWindowing.num(), 600); @@ -200,14 +212,14 @@ TEST_F(HistogramTest, HistogramWindowingMerge) { // dropping oldest window with value 1, remaining 2 ~ 4 PopulateHistogram(histogramWindowing, 4, 4, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 500); ASSERT_EQ(histogramWindowing.min(), 2); ASSERT_EQ(histogramWindowing.max(), 4); // dropping oldest window with value 2, remaining 3 ~ 5 PopulateHistogram(histogramWindowing, 5, 5, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 400); ASSERT_EQ(histogramWindowing.min(), 3); ASSERT_EQ(histogramWindowing.max(), 5); diff --git a/monitoring/histogram_windowing.cc b/monitoring/histogram_windowing.cc index e114a6686c0..f31bbe06ace 100644 --- a/monitoring/histogram_windowing.cc +++ b/monitoring/histogram_windowing.cc @@ -8,15 +8,17 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "monitoring/histogram_windowing.h" -#include "monitoring/histogram.h" -#include "util/cast_util.h" #include +#include "monitoring/histogram.h" +#include "rocksdb/system_clock.h" +#include "util/cast_util.h" + namespace ROCKSDB_NAMESPACE { HistogramWindowingImpl::HistogramWindowingImpl() { - env_ = Env::Default(); + clock_ = SystemClock::Default(); window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -28,7 +30,7 @@ HistogramWindowingImpl::HistogramWindowingImpl( num_windows_(num_windows), micros_per_window_(micros_per_window), min_num_per_window_(min_num_per_window) { - env_ = Env::Default(); + clock_ = SystemClock::Default(); window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -44,7 +46,7 @@ void HistogramWindowingImpl::Clear() { window_stats_[i].Clear(); } current_window_.store(0, std::memory_order_relaxed); - last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); } bool HistogramWindowingImpl::Empty() const { return stats_.Empty(); } @@ -129,7 +131,7 @@ void HistogramWindowingImpl::Data(HistogramData * const data) const { } void HistogramWindowingImpl::TimerTick() { - uint64_t curr_time = env_->NowMicros(); + uint64_t curr_time = clock_->NowMicros(); size_t curr_window_ = static_cast(current_window()); if (curr_time - last_swap_time() > micros_per_window_ && window_stats_[curr_window_].num() >= min_num_per_window_) { @@ -144,7 +146,7 @@ void HistogramWindowingImpl::SwapHistoryBucket() { // If mutex is held by Merge() or Clear(), next Add() will take care of the // swap, if needed. if (mutex_.try_lock()) { - last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); uint64_t curr_window = current_window(); uint64_t next_window = (curr_window == num_windows_ - 1) ? diff --git a/monitoring/histogram_windowing.h b/monitoring/histogram_windowing.h index 72545b07f36..f8da07b3665 100644 --- a/monitoring/histogram_windowing.h +++ b/monitoring/histogram_windowing.h @@ -10,9 +10,9 @@ #pragma once #include "monitoring/histogram.h" -#include "rocksdb/env.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; class HistogramWindowingImpl : public Histogram { @@ -44,7 +44,13 @@ class HistogramWindowingImpl : public Histogram virtual double StandardDeviation() const override; virtual void Data(HistogramData* const data) const override; -private: +#ifndef NDEBUG + void TEST_UpdateClock(const std::shared_ptr& clock) { + clock_ = clock; + } +#endif // NDEBUG + + private: void TimerTick(); void SwapHistoryBucket(); inline uint64_t current_window() const { @@ -54,7 +60,7 @@ class HistogramWindowingImpl : public Histogram return last_swap_time_.load(std::memory_order_relaxed); } - Env* env_; + std::shared_ptr clock_; std::mutex mutex_; // Aggregated stats over windows_stats_, all the computation is done diff --git a/monitoring/instrumented_mutex.cc b/monitoring/instrumented_mutex.cc index d82e594c1d0..adca63f263c 100644 --- a/monitoring/instrumented_mutex.cc +++ b/monitoring/instrumented_mutex.cc @@ -4,15 +4,17 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "monitoring/instrumented_mutex.h" + #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" namespace ROCKSDB_NAMESPACE { namespace { #ifndef NPERF_CONTEXT -Statistics* stats_for_report(Env* env, Statistics* stats) { - if (env != nullptr && stats != nullptr && +Statistics* stats_for_report(SystemClock* clock, Statistics* stats) { + if (clock != nullptr && stats != nullptr && stats->get_stats_level() > kExceptTimeForMutex) { return stats; } else { @@ -25,7 +27,7 @@ Statistics* stats_for_report(Env* env, Statistics* stats) { void InstrumentedMutex::Lock() { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_mutex_lock_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); LockInternal(); } @@ -39,7 +41,7 @@ void InstrumentedMutex::LockInternal() { void InstrumentedCondVar::Wait() { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); WaitInternal(); } @@ -53,7 +55,7 @@ void InstrumentedCondVar::WaitInternal() { bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); return TimedWaitInternal(abs_time_us); } diff --git a/monitoring/instrumented_mutex.h b/monitoring/instrumented_mutex.h index 50c1f29c8a5..19af1b4731a 100644 --- a/monitoring/instrumented_mutex.h +++ b/monitoring/instrumented_mutex.h @@ -7,8 +7,8 @@ #include "monitoring/statistics.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "rocksdb/thread_status.h" #include "util/stop_watch.h" @@ -20,13 +20,16 @@ class InstrumentedCondVar; class InstrumentedMutex { public: explicit InstrumentedMutex(bool adaptive = false) - : mutex_(adaptive), stats_(nullptr), env_(nullptr), - stats_code_(0) {} + : mutex_(adaptive), stats_(nullptr), clock_(nullptr), stats_code_(0) {} - InstrumentedMutex( - Statistics* stats, Env* env, - int stats_code, bool adaptive = false) - : mutex_(adaptive), stats_(stats), env_(env), + explicit InstrumentedMutex(SystemClock* clock, bool adaptive = false) + : mutex_(adaptive), stats_(nullptr), clock_(clock), stats_code_(0) {} + + InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code, + bool adaptive = false) + : mutex_(adaptive), + stats_(stats), + clock_(clock), stats_code_(stats_code) {} void Lock(); @@ -44,7 +47,7 @@ class InstrumentedMutex { friend class InstrumentedCondVar; port::Mutex mutex_; Statistics* stats_; - Env* env_; + SystemClock* clock_; int stats_code_; }; @@ -71,7 +74,7 @@ class InstrumentedCondVar { explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex) : cond_(&(instrumented_mutex->mutex_)), stats_(instrumented_mutex->stats_), - env_(instrumented_mutex->env_), + clock_(instrumented_mutex->clock_), stats_code_(instrumented_mutex->stats_code_) {} void Wait(); @@ -91,7 +94,7 @@ class InstrumentedCondVar { bool TimedWaitInternal(uint64_t abs_time_us); port::CondVar cond_; Statistics* stats_; - Env* env_; + SystemClock* clock_; int stats_code_; }; diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 2960f05e8cc..23bf3a694f0 100644 --- 
a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -9,19 +9,23 @@ namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#ifdef NIOSTATS_CONTEXT +// Should not be used because the counters are not thread-safe. +// Put here just to make get_iostats_context() simple without ifdef. +static IOStatsContext iostats_context; +#elif defined(ROCKSDB_SUPPORT_THREAD_LOCAL) __thread IOStatsContext iostats_context; +#else +#error \ + "No thread-local support. Disable iostats context with -DNIOSTATS_CONTEXT." #endif IOStatsContext* get_iostats_context() { -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL return &iostats_context; -#else - return nullptr; -#endif } void IOStatsContext::Reset() { +#ifndef NIOSTATS_CONTEXT thread_pool_id = Env::Priority::TOTAL; bytes_read = 0; bytes_written = 0; @@ -33,6 +37,9 @@ void IOStatsContext::Reset() { prepare_write_nanos = 0; fsync_nanos = 0; logger_nanos = 0; + cpu_write_nanos = 0; + cpu_read_nanos = 0; +#endif //! NIOSTATS_CONTEXT } #define IOSTATS_CONTEXT_OUTPUT(counter) \ @@ -41,6 +48,10 @@ void IOStatsContext::Reset() { } std::string IOStatsContext::ToString(bool exclude_zero_counters) const { +#ifdef NIOSTATS_CONTEXT + (void)exclude_zero_counters; + return ""; +#else std::ostringstream ss; IOSTATS_CONTEXT_OUTPUT(thread_pool_id); IOSTATS_CONTEXT_OUTPUT(bytes_read); @@ -53,10 +64,13 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const { IOSTATS_CONTEXT_OUTPUT(fsync_nanos); IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos); IOSTATS_CONTEXT_OUTPUT(logger_nanos); + IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos); + IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos); std::string str = ss.str(); str.erase(str.find_last_not_of(", ") + 1); return str; +#endif //! NIOSTATS_CONTEXT } } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/iostats_context_imp.h b/monitoring/iostats_context_imp.h index a7f095d6ef1..69b0c659071 100644 --- a/monitoring/iostats_context_imp.h +++ b/monitoring/iostats_context_imp.h @@ -7,7 +7,7 @@ #include "monitoring/perf_step_timer.h" #include "rocksdb/iostats_context.h" -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL) && !defined(NIOSTATS_CONTEXT) namespace ROCKSDB_NAMESPACE { extern __thread IOStatsContext iostats_context; } // namespace ROCKSDB_NAMESPACE @@ -38,13 +38,13 @@ extern __thread IOStatsContext iostats_context; iostats_step_timer_##metric.Start(); // Declare and set start time of the timer -#define IOSTATS_CPU_TIMER_GUARD(metric, env) \ +#define IOSTATS_CPU_TIMER_GUARD(metric, clock) \ PerfStepTimer iostats_step_timer_##metric( \ - &(iostats_context.metric), env, true, \ + &(iostats_context.metric), clock, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ iostats_step_timer_##metric.Start(); -#else // ROCKSDB_SUPPORT_THREAD_LOCAL +#else // ROCKSDB_SUPPORT_THREAD_LOCAL && !NIOSTATS_CONTEXT #define IOSTATS_ADD(metric, value) #define IOSTATS_ADD_IF_POSITIVE(metric, value) @@ -55,6 +55,6 @@ extern __thread IOStatsContext iostats_context; #define IOSTATS(metric) 0 #define IOSTATS_TIMER_GUARD(metric) -#define IOSTATS_CPU_TIMER_GUARD(metric, env) static_cast(env) +#define IOSTATS_CPU_TIMER_GUARD(metric, clock) static_cast(clock) -#endif // ROCKSDB_SUPPORT_THREAD_LOCAL +#endif // ROCKSDB_SUPPORT_THREAD_LOCAL && !NIOSTATS_CONTEXT diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 53f5024050a..d45d84fb6e3 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -9,26 +9,22 @@ namespace ROCKSDB_NAMESPACE { -#if defined(NPERF_CONTEXT) || 
!defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#if defined(NPERF_CONTEXT) +// Should not be used because the counters are not thread-safe. +// Put here just to make get_perf_context() simple without ifdef. PerfContext perf_context; -#else +#elif defined(ROCKSDB_SUPPORT_THREAD_LOCAL) #if defined(OS_SOLARIS) -__thread PerfContext perf_context_; -#else +__thread PerfContext perf_context; +#else // OS_SOLARIS thread_local PerfContext perf_context; -#endif +#endif // OS_SOLARIS +#else +#error "No thread-local support. Disable perf context with -DNPERF_CONTEXT." #endif PerfContext* get_perf_context() { -#if defined(NPERF_CONTEXT) || !defined(ROCKSDB_SUPPORT_THREAD_LOCAL) - return &perf_context; -#else -#if defined(OS_SOLARIS) - return &perf_context_; -#else return &perf_context; -#endif -#endif } PerfContext::~PerfContext() { diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index cdca27621ec..b7a56adef59 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -25,8 +25,8 @@ extern thread_local PerfContext perf_context; #define PERF_TIMER_STOP(metric) #define PERF_TIMER_START(metric) #define PERF_TIMER_GUARD(metric) -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) -#define PERF_CPU_TIMER_GUARD(metric, env) +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) +#define PERF_CPU_TIMER_GUARD(metric, clock) #define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ ticker_type) #define PERF_TIMER_MEASURE(metric) @@ -46,14 +46,14 @@ extern thread_local PerfContext perf_context; perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), clock); \ perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_CPU_TIMER_GUARD(metric, env) \ +#define PERF_CPU_TIMER_GUARD(metric, clock) \ PerfStepTimer perf_step_timer_##metric( \ - &(perf_context.metric), env, true, \ + &(perf_context.metric), clock, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ perf_step_timer_##metric.Start(); diff --git a/monitoring/perf_step_timer.h b/monitoring/perf_step_timer.h index f2d35d9d62d..fb049f7252b 100644 --- a/monitoring/perf_step_timer.h +++ b/monitoring/perf_step_timer.h @@ -5,26 +5,26 @@ // #pragma once #include "monitoring/perf_level_imp.h" -#include "rocksdb/env.h" -#include "util/stop_watch.h" +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { class PerfStepTimer { public: explicit PerfStepTimer( - uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, + uint64_t* metric, SystemClock* clock = nullptr, bool use_cpu_time = false, PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, Statistics* statistics = nullptr, uint32_t ticker_type = 0) : perf_counter_enabled_(perf_level >= enable_level), use_cpu_time_(use_cpu_time), - env_((perf_counter_enabled_ || statistics != nullptr) - ? ((env != nullptr) ? env : Env::Default()) - : nullptr), + ticker_type_(ticker_type), + clock_((perf_counter_enabled_ || statistics != nullptr) + ? (clock ? 
clock : SystemClock::Default().get()) + : nullptr), start_(0), metric_(metric), - statistics_(statistics), - ticker_type_(ticker_type) {} + statistics_(statistics) {} ~PerfStepTimer() { Stop(); @@ -36,14 +36,6 @@ class PerfStepTimer { } } - uint64_t time_now() { - if (!use_cpu_time_) { - return env_->NowNanos(); - } else { - return env_->NowCPUNanos(); - } - } - void Measure() { if (start_) { uint64_t now = time_now(); @@ -67,13 +59,21 @@ class PerfStepTimer { } private: + uint64_t time_now() { + if (!use_cpu_time_) { + return clock_->NowNanos(); + } else { + return clock_->CPUNanos(); + } + } + const bool perf_counter_enabled_; const bool use_cpu_time_; - Env* const env_; + uint32_t ticker_type_; + SystemClock* const clock_; uint64_t start_; uint64_t* metric_; Statistics* statistics_; - uint32_t ticker_type_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc index 7cc869cf219..86fe98f1fc0 100644 --- a/monitoring/persistent_stats_history.cc +++ b/monitoring/persistent_stats_history.cc @@ -12,7 +12,6 @@ #include #include #include "db/db_impl/db_impl.h" -#include "port/likely.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 2c7f65a89da..18d8eb16079 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -8,7 +8,6 @@ #include #include #include -#include "port/likely.h" #include "rocksdb/statistics.h" namespace ROCKSDB_NAMESPACE { @@ -192,6 +191,20 @@ const std::vector> TickersNameMap = { "rocksdb.block.cache.compression.dict.add.redundant"}, {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"}, {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"}, + {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.errro.count"}, + {ERROR_HANDLER_BG_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.io.errro.count"}, + {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.retryable.io.errro.count"}, + {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"}, + {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + "rocksdb.error.handler.autoresume.retry.total.count"}, + {ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + "rocksdb.error.handler.autoresume.success.count"}, + {MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + "rocksdb.memtable.payload.bytes.at.flush"}, + {MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + "rocksdb.memtable.garbage.bytes.at.flush"}, }; const std::vector> HistogramsNameMap = { @@ -247,6 +260,8 @@ const std::vector> HistogramsNameMap = { "rocksdb.num.index.and.filter.blocks.read.per.level"}, {NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"}, {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"}, + {ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + "rocksdb.error.handler.autoresume.retry.count"}, }; std::shared_ptr CreateDBStatistics() { diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index a1affb6d193..c1a2ad989f1 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -22,6 +22,7 @@ #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" #include "util/random.h" @@ -32,20 +33,23 @@ namespace ROCKSDB_NAMESPACE { class StatsHistoryTest : public DBTestBase { public: StatsHistoryTest() - : DBTestBase("/stats_history_test", /*env_do_fsync=*/true), - mock_env_(new 
MockTimeEnv(Env::Default())) {} + : DBTestBase("/stats_history_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } protected: - std::unique_ptr mock_env_; + std::shared_ptr mock_clock_; + std::unique_ptr mock_env_; void SetUp() override { - mock_env_->InstallTimedWaitFixCallback(); + mock_clock_->InstallTimedWaitFixCallback(); SyncPoint::GetInstance()->SetCallBack( "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { auto* periodic_work_scheduler_ptr = reinterpret_cast(arg); *periodic_work_scheduler_ptr = - PeriodicWorkTestScheduler::Default(mock_env_.get()); + PeriodicWorkTestScheduler::Default(mock_clock_); }); } }; @@ -65,17 +69,17 @@ TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { // Wait for the first stats persist to finish, as the initial delay could be // different. dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); int old_val = counter; for (int i = 1; i < 20; ++i) { - mock_env_->MockSleepForSeconds(kPeriodSec); + mock_clock_->MockSleepForSeconds(kPeriodSec); } ASSERT_EQ(counter, old_val); Close(); @@ -97,17 +101,17 @@ TEST_F(StatsHistoryTest, StatsPersistScheduling) { // Wait for the first stats persist to finish, as the initial delay could be // different. dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); int old_val = counter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec * 2); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec * 2); }); ASSERT_EQ(counter, old_val); Close(); @@ -129,7 +133,7 @@ TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); Close(); } @@ -148,30 +152,31 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { // make sure the first stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); // disabled stats snapshots ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); size_t stats_count = 0; for (; stats_iter->Valid(); stats_iter->Next()) { auto 
stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), mock_env_->NowSeconds()); + ASSERT_EQ(stats_iter->GetStatsTime(), mock_clock_->NowSeconds()); stats_count += stats_map.size(); } ASSERT_GT(stats_count, 0); // Wait a bit and verify no more stats are found for (int i = 0; i < 10; ++i) { dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(1); }); + [&] { mock_clock_->MockSleepForSeconds(1); }); } - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds(), &stats_iter)); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_new = 0; for (; stats_iter->Valid(); stats_iter->Next()) { @@ -224,11 +229,12 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { const int kIterations = 10; for (int i = 0; i < kIterations; ++i) { dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count = 0; int slice_count = 0; @@ -239,18 +245,19 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { } size_t stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); ASSERT_GE(slice_count, kIterations - 1); - ASSERT_GE(stats_history_size, 13000); - // capping memory cost at 13000 bytes since one slice is around 10000~13000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "13000"}})); - ASSERT_EQ(13000, dbfull()->GetDBOptions().stats_history_buffer_size); + ASSERT_GE(stats_history_size, 14000); + // capping memory cost at 14000 bytes since one slice is around 10000~14000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "14000"}})); + ASSERT_EQ(14000, dbfull()->GetDBOptions().stats_history_buffer_size); // Wait for stats persist to finish for (int i = 0; i < kIterations; ++i) { dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_reopen = 0; slice_count = 0; @@ -295,11 +302,11 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { // Wait for the first stats persist to finish, as the initial delay could be // different. 
dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); @@ -307,14 +314,14 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count2 = countkeys(iter); delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count3 = countkeys(iter); @@ -323,7 +330,8 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { ASSERT_GE(key_count3, key_count2); ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1); std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count = 0; int slice_count = 0; @@ -344,7 +352,8 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { ASSERT_EQ(stats_count, key_count3 - 2); // verify reopen will not cause data loss ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_reopen = 0; int slice_count_reopen = 0; @@ -386,37 +395,38 @@ TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { // Wait for the first stats persist to finish, as the initial delay could be // different. 
dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); std::map stats_map_after; ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after)); std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); std::string sample = "rocksdb.num.iterator.deleted"; uint64_t recovered_value = 0; @@ -433,7 +443,8 @@ TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { // test stats value retains after recovery ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); uint64_t new_recovered_value = 0; for (int i = 2; stats_iter->Valid(); stats_iter->Next(), i++) { @@ -473,10 +484,10 @@ TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { // make sure the first stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count = countkeys(iter); @@ -485,7 +496,7 @@ TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { uint64_t num_write_wal = 0; std::string sample = "rocksdb.write.wal"; std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds(), &stats_iter)); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); for (; stats_iter->Valid(); stats_iter->Next()) { auto stats_map = stats_iter->GetStatsMap(); @@ -521,7 +532,7 @@ TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, &handle)); // verify stats is not affected by prior failed CF creation - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds(), &stats_iter)); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); num_write_wal = 0; for (; stats_iter->Valid(); stats_iter->Next()) { @@ -552,7 +563,7 @@ 
TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { // Reopen and flush memtable. ASSERT_OK(TryReopen(options)); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); @@ -573,7 +584,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { // Wait for the first stats persist to finish, as the initial delay could be // different. dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); ColumnFamilyData* cfd_default = static_cast(dbfull()->DefaultColumnFamily()) @@ -592,7 +603,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_EQ("v0", Get(1, "Eevee")); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flush default cf // LogNumbers: default: 14, stats: 4, pikachu: 4 ASSERT_OK(Flush()); @@ -617,7 +628,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_EQ("v2", Get("foo2")); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to default and stats cf, flushing default cf // LogNumbers: default: 19, stats: 19, pikachu: 19 ASSERT_OK(Flush()); @@ -632,7 +643,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_EQ("v3", Get(1, "Jolteon")); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flushing test cf // LogNumbers: default: 19, stats: 19, pikachu: 22 ASSERT_OK(Flush(1)); diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc index 7e4b299a8c0..267a0c0b082 100644 --- a/monitoring/thread_status_updater.cc +++ b/monitoring/thread_status_updater.cc @@ -4,9 +4,12 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "monitoring/thread_status_updater.h" + #include + #include "port/likely.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -159,7 +162,7 @@ Status ThreadStatusUpdater::GetThreadList( std::vector* thread_list) { thread_list->clear(); std::vector> valid_list; - uint64_t now_micros = Env::Default()->NowMicros(); + uint64_t now_micros = SystemClock::Default()->NowMicros(); std::lock_guard lck(thread_list_mutex_); for (auto* thread_data : thread_data_set_) { diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc index 13a79163c3e..37fcb9f4869 100644 --- a/monitoring/thread_status_util.cc +++ b/monitoring/thread_status_util.cc @@ -7,6 +7,7 @@ #include "monitoring/thread_status_updater.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -57,7 +58,7 @@ void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { } if (op != ThreadStatus::OP_UNKNOWN) { - uint64_t current_time = Env::Default()->NowMicros(); + uint64_t current_time = SystemClock::Default()->NowMicros(); thread_updater_local_cache_->SetOperationStartTime(current_time); } else { // TDOO(yhchiang): we could report the time when we set operation to diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc index 375fe8c0ae0..c493ddca542 100644 --- a/monitoring/thread_status_util_debug.cc +++ b/monitoring/thread_status_util_debug.cc @@ -7,7 +7,7 @@ #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +23,7 @@ void ThreadStatusUtil::TEST_SetStateDelay( void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) { auto delay = states_delay[state].load(std::memory_order_relaxed); if (delay > 0) { - Env::Default()->SleepForMicroseconds(delay); + SystemClock::Default()->SleepForMicroseconds(delay); } } diff --git a/options/cf_options.cc b/options/cf_options.cc index fb56f238857..005a90c8554 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -38,96 +38,94 @@ namespace ROCKSDB_NAMESPACE { // http://en.cppreference.com/w/cpp/concept/StandardLayoutType // https://gist.github.com/graphitemaster/494f21190bb2c63c5516 #ifndef ROCKSDB_LITE -static ColumnFamilyOptions dummy_cf_options; +static ImmutableCFOptions dummy_cf_options; template -int offset_of(T1 ColumnFamilyOptions::*member) { - return int(size_t(&(dummy_cf_options.*member)) - size_t(&dummy_cf_options)); -} -template -int offset_of(T1 AdvancedColumnFamilyOptions::*member) { +int offset_of(T1 ImmutableCFOptions::*member) { return int(size_t(&(dummy_cf_options.*member)) - size_t(&dummy_cf_options)); } static Status ParseCompressionOptions(const std::string& value, const std::string& name, CompressionOptions& compression_opts) { - size_t start = 0; - size_t end = value.find(':'); - if (end == std::string::npos) { + const char kDelimiter = ':'; + std::istringstream field_stream(value); + std::string field; + + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument("unable to parse the specified CF option " + name); } - compression_opts.window_bits = ParseInt(value.substr(start, end - start)); - start = end + 1; - end = value.find(':', start); - if (end == std::string::npos) { + compression_opts.window_bits = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { return 
Status::InvalidArgument("unable to parse the specified CF option " + name); } - compression_opts.level = ParseInt(value.substr(start, end - start)); - start = end + 1; - if (start >= value.size()) { + compression_opts.level = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument("unable to parse the specified CF option " + name); } - end = value.find(':', start); - compression_opts.strategy = - ParseInt(value.substr(start, value.size() - start)); + compression_opts.strategy = ParseInt(field); + // max_dict_bytes is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument( "unable to parse the specified CF option " + name); } - compression_opts.max_dict_bytes = - ParseInt(value.substr(start, value.size() - start)); - end = value.find(':', start); + compression_opts.max_dict_bytes = ParseInt(field); } + // zstd_max_train_bytes is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument( "unable to parse the specified CF option " + name); } - compression_opts.zstd_max_train_bytes = - ParseInt(value.substr(start, value.size() - start)); - end = value.find(':', start); + compression_opts.zstd_max_train_bytes = ParseInt(field); } // parallel_threads is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument( "unable to parse the specified CF option " + name); } // Since parallel_threads comes before enabled but was added optionally // later, we need to check if this is the final token (meaning it is the - // enabled bit), or if there is another token (meaning this one is - // parallel_threads) - end = value.find(':', start); - if (end != std::string::npos) { - compression_opts.parallel_threads = - ParseInt(value.substr(start, value.size() - start)); + // enabled bit), or if there are more tokens (meaning this one is + // parallel_threads). 
+ if (!field_stream.eof()) { + compression_opts.parallel_threads = ParseInt(field); } else { // parallel_threads is not serialized with this format, but enabled is - compression_opts.parallel_threads = CompressionOptions().parallel_threads; - compression_opts.enabled = - ParseBoolean("", value.substr(start, value.size() - start)); + compression_opts.enabled = ParseBoolean("", field); } } // enabled is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument( "unable to parse the specified CF option " + name); } - compression_opts.enabled = - ParseBoolean("", value.substr(start, value.size() - start)); + compression_opts.enabled = ParseBoolean("", field); + } + + // max_dict_buffer_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.max_dict_buffer_bytes = ParseUint64(field); + } + + if (!field_stream.eof()) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); } return Status::OK(); } @@ -161,6 +159,10 @@ static std::unordered_map {"enabled", {offsetof(struct CompressionOptions, enabled), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_dict_buffer_bytes", + {offsetof(struct CompressionOptions, max_dict_buffer_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, }; static std::unordered_map @@ -362,7 +364,7 @@ static std::unordered_map offsetof(struct MutableCFOptions, compaction_options_fifo), OptionVerificationType::kNormal, OptionTypeFlags::kMutable, [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { // This is to handle backward compatibility, where // compaction_options_fifo could be assigned a single scalar // value, say, like "23", which would be assigned to @@ -370,7 +372,7 @@ static std::unordered_map if (name == "compaction_options_fifo" && value.find("=") == std::string::npos) { // Old format. Parse just a single uint64_t value. 
- auto options = reinterpret_cast(addr); + auto options = static_cast(addr); options->max_table_files_size = ParseUint64(value); return Status::OK(); } else { @@ -408,6 +410,14 @@ static std::unordered_map {offsetof(struct MutableCFOptions, blob_compression_type), OptionType::kCompressionType, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"enable_blob_garbage_collection", + {offsetof(struct MutableCFOptions, enable_blob_garbage_collection), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_age_cutoff", + {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"sample_for_compression", {offsetof(struct MutableCFOptions, sample_for_compression), OptionType::kUInt64T, OptionVerificationType::kNormal, @@ -423,13 +433,12 @@ static std::unordered_map OptionVerificationType::kNormal, (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { // This is to handle backward compatibility, where // compression_options was a ":" separated list. if (name == kOptNameCompOpts && value.find("=") == std::string::npos) { - auto* compression = - reinterpret_cast(addr); + auto* compression = static_cast(addr); return ParseCompressionOptions(value, name, *compression); } else { return OptionTypeInfo::ParseStruct( @@ -444,13 +453,12 @@ static std::unordered_map OptionVerificationType::kNormal, (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { // This is to handle backward compatibility, where // compression_options was a ":" separated list. 
if (name == kOptNameBMCompOpts && value.find("=") == std::string::npos) { - auto* compression = - reinterpret_cast(addr); + auto* compression = static_cast(addr); return ParseCompressionOptions(value, name, *compression); } else { return OptionTypeInfo::ParseStruct( @@ -478,64 +486,65 @@ static std::unordered_map {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"inplace_update_support", - {offset_of(&ColumnFamilyOptions::inplace_update_support), + {offset_of(&ImmutableCFOptions::inplace_update_support), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"level_compaction_dynamic_level_bytes", - {offset_of(&ColumnFamilyOptions::level_compaction_dynamic_level_bytes), + {offset_of(&ImmutableCFOptions::level_compaction_dynamic_level_bytes), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"optimize_filters_for_hits", - {offset_of(&ColumnFamilyOptions::optimize_filters_for_hits), + {offset_of(&ImmutableCFOptions::optimize_filters_for_hits), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"force_consistency_checks", - {offset_of(&ColumnFamilyOptions::force_consistency_checks), + {offset_of(&ImmutableCFOptions::force_consistency_checks), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"purge_redundant_kvs_while_flush", - {offset_of(&ColumnFamilyOptions::purge_redundant_kvs_while_flush), + {offset_of(&ImmutableCFOptions::purge_redundant_kvs_while_flush), OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"max_mem_compaction_level", {0, OptionType::kInt, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"max_write_buffer_number_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), + {offset_of(&ImmutableCFOptions::max_write_buffer_number_to_maintain), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone, 0}}, {"max_write_buffer_size_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_size_to_maintain), + {offset_of(&ImmutableCFOptions::max_write_buffer_size_to_maintain), OptionType::kInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"min_write_buffer_number_to_merge", - {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), + {offset_of(&ImmutableCFOptions::min_write_buffer_number_to_merge), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone, 0}}, {"num_levels", - {offset_of(&ColumnFamilyOptions::num_levels), OptionType::kInt, + {offset_of(&ImmutableCFOptions::num_levels), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"bloom_locality", - {offset_of(&ColumnFamilyOptions::bloom_locality), OptionType::kUInt32T, + {offset_of(&ImmutableCFOptions::bloom_locality), OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"rate_limit_delay_max_milliseconds", {0, OptionType::kUInt, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"compression_per_level", OptionTypeInfo::Vector( - offset_of(&ColumnFamilyOptions::compression_per_level), + offset_of(&ImmutableCFOptions::compression_per_level), OptionVerificationType::kNormal, OptionTypeFlags::kNone, {0, OptionType::kCompressionType})}, {"comparator", - {offset_of(&ColumnFamilyOptions::comparator), OptionType::kComparator, - OptionVerificationType::kByName, OptionTypeFlags::kCompareLoose, + {offset_of(&ImmutableCFOptions::user_comparator), + 
OptionType::kComparator, OptionVerificationType::kByName, + OptionTypeFlags::kCompareLoose, // Parses the string and sets the corresponding comparator - [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto old_comparator = reinterpret_cast(addr); + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto old_comparator = static_cast(addr); const Comparator* new_comparator = *old_comparator; - Status status = ObjectRegistry::NewInstance()->NewStaticObject( - value, &new_comparator); + Status status = + opts.registry->NewStaticObject(value, &new_comparator); if (status.ok()) { *old_comparator = new_comparator; return status; @@ -544,61 +553,45 @@ static std::unordered_map }}}, {"memtable_insert_with_hint_prefix_extractor", {offset_of( - &ColumnFamilyOptions::memtable_insert_with_hint_prefix_extractor), + &ImmutableCFOptions::memtable_insert_with_hint_prefix_extractor), OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, OptionTypeFlags::kNone}}, {"memtable_factory", - {offset_of(&ColumnFamilyOptions::memtable_factory), + {offset_of(&ImmutableCFOptions::memtable_factory), OptionType::kMemTableRepFactory, OptionVerificationType::kByName, OptionTypeFlags::kNone}}, {"memtable", - {offset_of(&ColumnFamilyOptions::memtable_factory), + {offset_of(&ImmutableCFOptions::memtable_factory), OptionType::kMemTableRepFactory, OptionVerificationType::kAlias, OptionTypeFlags::kNone, // Parses the value string and updates the memtable_factory [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { + const std::string& value, void* addr) { std::unique_ptr new_mem_factory; Status s = GetMemTableRepFactoryFromString(value, &new_mem_factory); if (s.ok()) { auto memtable_factory = - reinterpret_cast*>(addr); + static_cast*>(addr); memtable_factory->reset(new_mem_factory.release()); } return s; }}}, - {"table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), - OptionType::kConfigurable, OptionVerificationType::kByName, - (OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose | - OptionTypeFlags::kDontPrepare), - // Creates a new TableFactory based on value - [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& value, char* addr) { - auto table_factory = - reinterpret_cast*>(addr); - return TableFactory::CreateFromString(opts, value, table_factory); - }, - // Converts the TableFactory into its string representation - [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - const auto* table_factory = - reinterpret_cast*>(addr); - *value = table_factory->get() ? 
table_factory->get()->Name() - : kNullptrString; - return Status::OK(); - }, - /* No equals function for table factories */ nullptr}}, + {"table_factory", OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::table_factory), + OptionVerificationType::kByName, + (OptionTypeFlags::kCompareLoose | + OptionTypeFlags::kStringNameOnly | + OptionTypeFlags::kDontPrepare))}, {"block_based_table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), - OptionType::kConfigurable, OptionVerificationType::kAlias, + {offset_of(&ImmutableCFOptions::table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, // Parses the input value and creates a BlockBasedTableFactory [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { BlockBasedTableOptions* old_opts = nullptr; auto table_factory = - reinterpret_cast*>(addr); + static_cast*>(addr); if (table_factory->get() != nullptr) { old_opts = table_factory->get()->GetOptions(); @@ -622,15 +615,15 @@ static std::unordered_map } }}}, {"plain_table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), - OptionType::kConfigurable, OptionVerificationType::kAlias, + {offset_of(&ImmutableCFOptions::table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, // Parses the input value and creates a PlainTableFactory [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { PlainTableOptions* old_opts = nullptr; auto table_factory = - reinterpret_cast*>(addr); + static_cast*>(addr); if (table_factory->get() != nullptr) { old_opts = table_factory->get()->GetOptions(); } @@ -653,25 +646,24 @@ static std::unordered_map } }}}, {"compaction_filter", - {offset_of(&ColumnFamilyOptions::compaction_filter), + {offset_of(&ImmutableCFOptions::compaction_filter), OptionType::kCompactionFilter, OptionVerificationType::kByName, OptionTypeFlags::kNone}}, {"compaction_filter_factory", - {offset_of(&ColumnFamilyOptions::compaction_filter_factory), + {offset_of(&ImmutableCFOptions::compaction_filter_factory), OptionType::kCompactionFilterFactory, OptionVerificationType::kByName, OptionTypeFlags::kNone}}, {"merge_operator", - {offset_of(&ColumnFamilyOptions::merge_operator), + {offset_of(&ImmutableCFOptions::merge_operator), OptionType::kMergeOperator, OptionVerificationType::kByNameAllowFromNull, OptionTypeFlags::kCompareLoose, // Parses the input value as a MergeOperator, updating the value - [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto mop = reinterpret_cast*>(addr); + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto mop = static_cast*>(addr); Status status = - ObjectRegistry::NewInstance()->NewSharedObject( - value, mop); + opts.registry->NewSharedObject(value, mop); // Only support static comparator for now. 
if (status.ok()) { return status; @@ -679,11 +671,11 @@ static std::unordered_map return Status::OK(); }}}, {"compaction_style", - {offset_of(&ColumnFamilyOptions::compaction_style), + {offset_of(&ImmutableCFOptions::compaction_style), OptionType::kCompactionStyle, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"compaction_pri", - {offset_of(&ColumnFamilyOptions::compaction_pri), + {offset_of(&ImmutableCFOptions::compaction_pri), OptionType::kCompactionPri, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, }; @@ -692,10 +684,9 @@ const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions"; class ConfigurableMutableCFOptions : public Configurable { public: - ConfigurableMutableCFOptions(const MutableCFOptions& mcf) { + explicit ConfigurableMutableCFOptions(const MutableCFOptions& mcf) { mutable_ = mcf; - ConfigurableHelper::RegisterOptions(*this, &mutable_, - &cf_mutable_options_type_info); + RegisterOptions(&mutable_, &cf_mutable_options_type_info); } protected: @@ -710,9 +701,7 @@ class ConfigurableCFOptions : public ConfigurableMutableCFOptions { immutable_(opts), cf_options_(opts), opt_map_(map) { - ConfigurableHelper::RegisterOptions(*this, OptionsHelper::kCFOptionsName, - &immutable_, - &cf_immutable_options_type_info); + RegisterOptions(&immutable_, &cf_immutable_options_type_info); } protected: @@ -720,10 +709,10 @@ class ConfigurableCFOptions : public ConfigurableMutableCFOptions { const ConfigOptions& config_options, const std::unordered_map& opts_map, std::unordered_map* unused) override { - Status s = ConfigurableHelper::ConfigureOptions(config_options, *this, - opts_map, unused); + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); if (s.ok()) { - cf_options_ = BuildColumnFamilyOptions(immutable_, mutable_); + UpdateColumnFamilyOptions(mutable_, &cf_options_); + UpdateColumnFamilyOptions(immutable_, &cf_options_); s = PrepareOptions(config_options); } return s; @@ -777,7 +766,7 @@ class ConfigurableCFOptions : public ConfigurableMutableCFOptions { } private: - ColumnFamilyOptions immutable_; + ImmutableCFOptions immutable_; ColumnFamilyOptions cf_options_; const std::unordered_map* opt_map_; }; @@ -795,18 +784,16 @@ std::unique_ptr CFOptionsAsConfigurable( } #endif // ROCKSDB_LITE -ImmutableCFOptions::ImmutableCFOptions(const Options& options) - : ImmutableCFOptions(ImmutableDBOptions(options), options) {} +ImmutableCFOptions::ImmutableCFOptions() : ImmutableCFOptions(Options()) {} -ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& cf_options) +ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) : compaction_style(cf_options.compaction_style), compaction_pri(cf_options.compaction_pri), user_comparator(cf_options.comparator), internal_comparator(InternalKeyComparator(cf_options.comparator)), - merge_operator(cf_options.merge_operator.get()), + merge_operator(cf_options.merge_operator), compaction_filter(cf_options.compaction_filter), - compaction_filter_factory(cf_options.compaction_filter_factory.get()), + compaction_filter_factory(cf_options.compaction_filter_factory), min_write_buffer_number_to_merge( cf_options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( @@ -815,45 +802,45 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, cf_options.max_write_buffer_size_to_maintain), inplace_update_support(cf_options.inplace_update_support), inplace_callback(cf_options.inplace_callback), - 
info_log(db_options.info_log.get()), - statistics(db_options.statistics.get()), - rate_limiter(db_options.rate_limiter.get()), - info_log_level(db_options.info_log_level), - env(db_options.env), - fs(db_options.fs.get()), - allow_mmap_reads(db_options.allow_mmap_reads), - allow_mmap_writes(db_options.allow_mmap_writes), - db_paths(db_options.db_paths), - memtable_factory(cf_options.memtable_factory.get()), - table_factory(cf_options.table_factory.get()), + memtable_factory(cf_options.memtable_factory), + table_factory(cf_options.table_factory), table_properties_collector_factories( cf_options.table_properties_collector_factories), - advise_random_on_open(db_options.advise_random_on_open), bloom_locality(cf_options.bloom_locality), purge_redundant_kvs_while_flush( cf_options.purge_redundant_kvs_while_flush), - use_fsync(db_options.use_fsync), compression_per_level(cf_options.compression_per_level), level_compaction_dynamic_level_bytes( cf_options.level_compaction_dynamic_level_bytes), - access_hint_on_compaction_start( - db_options.access_hint_on_compaction_start), - new_table_reader_for_compaction_inputs( - db_options.new_table_reader_for_compaction_inputs), num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), - allow_ingest_behind(db_options.allow_ingest_behind), - preserve_deletes(db_options.preserve_deletes), - listeners(db_options.listeners), - row_cache(db_options.row_cache), memtable_insert_with_hint_prefix_extractor( - cf_options.memtable_insert_with_hint_prefix_extractor.get()), + cf_options.memtable_insert_with_hint_prefix_extractor), cf_paths(cf_options.cf_paths), compaction_thread_limiter(cf_options.compaction_thread_limiter), - file_checksum_gen_factory(db_options.file_checksum_gen_factory.get()), - sst_partitioner_factory(cf_options.sst_partitioner_factory), - allow_data_in_errors(db_options.allow_data_in_errors) {} + sst_partitioner_factory(cf_options.sst_partitioner_factory) {} + +ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {} + +ImmutableOptions::ImmutableOptions(const Options& options) + : ImmutableOptions(options, options) {} + +ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} // Multiple two operands. If they overflow, return op1. uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) { @@ -1021,9 +1008,38 @@ void MutableCFOptions::Dump(Logger* log) const { blob_file_size); ROCKS_LOG_INFO(log, " blob_compression_type: %s", CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_INFO(log, " enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? 
"true" : "false"); + ROCKS_LOG_INFO(log, " blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); } MutableCFOptions::MutableCFOptions(const Options& options) : MutableCFOptions(ColumnFamilyOptions(options)) {} +#ifndef ROCKSDB_LITE +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* /*info_log*/, MutableCFOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, cf_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; +} + +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string) { + assert(opt_string); + opt_string->clear(); + return OptionTypeInfo::SerializeType( + config_options, cf_mutable_options_type_info, &mutable_opts, opt_string); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/cf_options.h b/options/cf_options.h index ca086b5c80a..d4e77f04f7b 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -20,24 +20,23 @@ namespace ROCKSDB_NAMESPACE { // of DB. Raw pointers defined in this struct do not have ownership to the data // they point to. Options contains std::shared_ptr to these data. struct ImmutableCFOptions { + public: static const char* kName() { return "ImmutableCFOptions"; } - explicit ImmutableCFOptions(const Options& options); - - ImmutableCFOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& cf_options); + explicit ImmutableCFOptions(); + explicit ImmutableCFOptions(const ColumnFamilyOptions& cf_options); CompactionStyle compaction_style; CompactionPri compaction_pri; const Comparator* user_comparator; - InternalKeyComparator internal_comparator; + InternalKeyComparator internal_comparator; // Only in Immutable - MergeOperator* merge_operator; + std::shared_ptr merge_operator; const CompactionFilter* compaction_filter; - CompactionFilterFactory* compaction_filter_factory; + std::shared_ptr compaction_filter_factory; int min_write_buffer_number_to_merge; @@ -52,78 +51,54 @@ struct ImmutableCFOptions { Slice delta_value, std::string* merged_value); - Logger* info_log; - - Statistics* statistics; - - RateLimiter* rate_limiter; - - InfoLogLevel info_log_level; - - Env* env; - - FileSystem* fs; + std::shared_ptr memtable_factory; - // Allow the OS to mmap file for reading sst tables. Default: false - bool allow_mmap_reads; - - // Allow the OS to mmap file for writing. Default: false - bool allow_mmap_writes; - - std::vector db_paths; - - MemTableRepFactory* memtable_factory; - - TableFactory* table_factory; + std::shared_ptr table_factory; Options::TablePropertiesCollectorFactories table_properties_collector_factories; - bool advise_random_on_open; - // This options is required by PlainTableReader. 
May need to move it // to PlainTableOptions just like bloom_bits_per_key uint32_t bloom_locality; bool purge_redundant_kvs_while_flush; - bool use_fsync; - std::vector compression_per_level; bool level_compaction_dynamic_level_bytes; - Options::AccessHint access_hint_on_compaction_start; - - bool new_table_reader_for_compaction_inputs; - int num_levels; bool optimize_filters_for_hits; bool force_consistency_checks; - bool allow_ingest_behind; - - bool preserve_deletes; + std::shared_ptr + memtable_insert_with_hint_prefix_extractor; - // A vector of EventListeners which callback functions will be called - // when specific RocksDB event happens. - std::vector> listeners; + std::vector cf_paths; - std::shared_ptr row_cache; + std::shared_ptr compaction_thread_limiter; - const SliceTransform* memtable_insert_with_hint_prefix_extractor; + std::shared_ptr sst_partitioner_factory; +}; - std::vector cf_paths; +struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions { + explicit ImmutableOptions(); + explicit ImmutableOptions(const Options& options); - std::shared_ptr compaction_thread_limiter; + ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); - FileChecksumGenFactory* file_checksum_gen_factory; + ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options); - std::shared_ptr sst_partitioner_factory; + ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options); - bool allow_data_in_errors; + ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options); }; struct MutableCFOptions { @@ -163,6 +138,9 @@ struct MutableCFOptions { min_blob_size(options.min_blob_size), blob_file_size(options.blob_file_size), blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations), check_flush_compaction_key_order( @@ -173,6 +151,7 @@ struct MutableCFOptions { bottommost_compression(options.bottommost_compression), compression_opts(options.compression_opts), bottommost_compression_opts(options.bottommost_compression_opts), + bottommost_temperature(options.bottommost_temperature), sample_for_compression( options.sample_for_compression) { // TODO: is 0 fine here? RefreshDerivedOptions(options.num_levels, options.compaction_style); @@ -206,12 +185,15 @@ struct MutableCFOptions { min_blob_size(0), blob_file_size(0), blob_compression_type(kNoCompression), + enable_blob_garbage_collection(false), + blob_garbage_collection_age_cutoff(0.0), max_sequential_skip_in_iterations(0), check_flush_compaction_key_order(true), paranoid_file_checks(false), report_bg_io_stats(false), compression(Snappy_Supported() ? 
kSnappyCompression : kNoCompression), bottommost_compression(kDisableCompressionOption), + bottommost_temperature(Temperature::kUnknown), sample_for_compression(0) {} explicit MutableCFOptions(const Options& options); @@ -267,6 +249,8 @@ struct MutableCFOptions { uint64_t min_blob_size; uint64_t blob_file_size; CompressionType blob_compression_type; + bool enable_blob_garbage_collection; + double blob_garbage_collection_age_cutoff; // Misc options uint64_t max_sequential_skip_in_iterations; @@ -277,6 +261,9 @@ struct MutableCFOptions { CompressionType bottommost_compression; CompressionOptions compression_opts; CompressionOptions bottommost_compression_opts; + // TODO this experimental option isn't made configurable + // through strings yet. + Temperature bottommost_temperature; uint64_t sample_for_compression; @@ -296,4 +283,15 @@ uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options, // `pin_l0_filter_and_index_blocks_in_cache` is set. size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options); +#ifndef ROCKSDB_LITE +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* info_log, MutableCFOptions* new_options); +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff --git a/options/configurable.cc b/options/configurable.cc index 8c11b0b0edf..f425f193ce8 100644 --- a/options/configurable.cc +++ b/options/configurable.cc @@ -8,6 +8,7 @@ #include "logging/logging.h" #include "options/configurable_helper.h" #include "options/options_helper.h" +#include "rocksdb/customizable.h" #include "rocksdb/status.h" #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/options_type.h" @@ -16,10 +17,10 @@ namespace ROCKSDB_NAMESPACE { -void ConfigurableHelper::RegisterOptions( - Configurable& configurable, const std::string& name, void* opt_ptr, +void Configurable::RegisterOptions( + const std::string& name, void* opt_ptr, const std::unordered_map* type_map) { - Configurable::RegisteredOptions opts; + RegisteredOptions opts; opts.name = name; #ifndef ROCKSDB_LITE opts.type_map = type_map; @@ -27,7 +28,7 @@ void ConfigurableHelper::RegisterOptions( (void)type_map; #endif // ROCKSDB_LITE opts.opt_ptr = opt_ptr; - configurable.options_.emplace_back(opts); + options_.emplace_back(opts); } //************************************************************************* @@ -57,13 +58,9 @@ Status Configurable::PrepareOptions(const ConfigOptions& opts) { } } } +#else + (void)opts; #endif // ROCKSDB_LITE - if (status.ok()) { - auto inner = Inner(); - if (inner != nullptr) { - status = inner->PrepareOptions(opts); - } - } if (status.ok()) { prepared_ = true; } @@ -94,13 +91,10 @@ Status Configurable::ValidateOptions(const DBOptions& db_opts, } } } +#else + (void)db_opts; + (void)cf_opts; #endif // ROCKSDB_LITE - if (status.ok()) { - const auto inner = Inner(); - if (inner != nullptr) { - status = inner->ValidateOptions(db_opts, cf_opts); - } - } return status; } @@ -116,12 +110,7 @@ const void* Configurable::GetOptionsPtr(const std::string& name) const { return o.opt_ptr; } } - auto inner = Inner(); - if (inner != nullptr) { - return inner->GetOptionsPtr(name); - } else { - return nullptr; - } + return nullptr; } std::string Configurable::GetOptionName(const std::string& opt_name) const { @@ -172,7 +161,10 @@ Status Configurable::ConfigureOptions( #ifndef 
ROCKSDB_LITE if (!config_options.ignore_unknown_options) { // If we are not ignoring unused, get the defaults in case we need to reset - GetOptionString(config_options, &curr_opts).PermitUncheckedError(); + ConfigOptions copy = config_options; + copy.depth = ConfigOptions::kDepthDetailed; + copy.delimiter = "; "; + GetOptionString(copy, &curr_opts).PermitUncheckedError(); } #endif // ROCKSDB_LITE Status s = ConfigurableHelper::ConfigureOptions(config_options, *this, @@ -234,9 +226,8 @@ Status Configurable::ConfigureFromString(const ConfigOptions& config_options, Status Configurable::ConfigureOption(const ConfigOptions& config_options, const std::string& name, const std::string& value) { - const std::string& opt_name = GetOptionName(name); - return ConfigurableHelper::ConfigureSingleOption(config_options, *this, - opt_name, value); + return ConfigurableHelper::ConfigureSingleOption(config_options, *this, name, + value); } /** @@ -250,9 +241,16 @@ Status Configurable::ParseOption(const ConfigOptions& config_options, const OptionTypeInfo& opt_info, const std::string& opt_name, const std::string& opt_value, void* opt_ptr) { - if (opt_info.IsMutable() || opt_info.IsConfigurable()) { - return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); - } else if (prepared_) { + if (opt_info.IsMutable()) { + if (config_options.mutable_options_only) { + // This option is mutable. Treat all of its children as mutable as well + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + return opt_info.Parse(copy, opt_name, opt_value, opt_ptr); + } else { + return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); + } + } else if (config_options.mutable_options_only) { return Status::InvalidArgument("Option not changeable: " + opt_name); } else { return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); @@ -375,15 +373,91 @@ Status ConfigurableHelper::ConfigureSomeOptions( Status ConfigurableHelper::ConfigureSingleOption( const ConfigOptions& config_options, Configurable& configurable, const std::string& name, const std::string& value) { - std::string opt_name; + const std::string& opt_name = configurable.GetOptionName(name); + std::string elem_name; void* opt_ptr = nullptr; const auto opt_info = - FindOption(configurable.options_, name, &opt_name, &opt_ptr); + FindOption(configurable.options_, opt_name, &elem_name, &opt_ptr); if (opt_info == nullptr) { return Status::NotFound("Could not find option: ", name); } else { - return ConfigureOption(config_options, configurable, *opt_info, name, - opt_name, value, opt_ptr); + return ConfigureOption(config_options, configurable, *opt_info, opt_name, + elem_name, value, opt_ptr); + } +} +Status ConfigurableHelper::ConfigureCustomizableOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr) { + Customizable* custom = opt_info.AsRawPointer(opt_ptr); + ConfigOptions copy = config_options; + if (opt_info.IsMutable()) { + // This option is mutable. 
Pass that property on to any subsequent calls + copy.mutable_options_only = false; + } + + if (opt_info.IsMutable() || !config_options.mutable_options_only) { + // Either the option is mutable, or we are processing all of the options + if (opt_name == name || + EndsWith(opt_name, ConfigurableHelper::kIdPropSuffix) || + name == ConfigurableHelper::kIdPropName) { + return configurable.ParseOption(copy, opt_info, opt_name, value, opt_ptr); + } else if (value.empty()) { + return Status::OK(); + } else if (custom == nullptr || !StartsWith(name, custom->GetId() + ".")) { + return configurable.ParseOption(copy, opt_info, name, value, opt_ptr); + } else if (value.find("=") != std::string::npos) { + return custom->ConfigureFromString(copy, value); + } else { + return custom->ConfigureOption(copy, name, value); + } + } else { + // We are processing immutable options, which means that we cannot change + // the Customizable object itself, but could change its mutable properties. + // Check to make sure that nothing is trying to change the Customizable + if (custom == nullptr) { + // We do not have a Customizable to configure. This is OK if the + // value is empty (nothing being configured) but an error otherwise + if (value.empty()) { + return Status::OK(); + } else { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } + } else if (EndsWith(opt_name, ConfigurableHelper::kIdPropSuffix) || + name == ConfigurableHelper::kIdPropName) { + // We have a property of the form "id=value" or "table.id=value" + // This is OK if we ID/value matches the current customizable object + if (custom->GetId() == value) { + return Status::OK(); + } else { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } + } else if (opt_name == name) { + // The properties are of one of forms: + // name = { id = id; prop1 = value1; ... } + // name = { prop1=value1; prop2=value2; ... } + // name = ID + // Convert the value to a map and extract the ID + // If the ID does not match that of the current customizable, return an + // error. 
Otherwise, update the current customizable via the properties + // map + std::unordered_map props; + std::string id; + Status s = GetOptionsMap(value, custom->GetId(), &id, &props); + if (!s.ok()) { + return s; + } else if (custom->GetId() != id) { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } else if (props.empty()) { + return Status::OK(); + } else { + return custom->ConfigureFromMap(copy, props); + } + } else { + // Attempting to configure one of the properties of the customizable + // Let it through + return custom->ConfigureOption(copy, name, value); + } } } @@ -391,7 +465,10 @@ Status ConfigurableHelper::ConfigureOption( const ConfigOptions& config_options, Configurable& configurable, const OptionTypeInfo& opt_info, const std::string& opt_name, const std::string& name, const std::string& value, void* opt_ptr) { - if (opt_name == name) { + if (opt_info.IsCustomizable()) { + return ConfigureCustomizableOption(config_options, configurable, opt_info, + opt_name, name, value, opt_ptr); + } else if (opt_name == name) { return configurable.ParseOption(config_options, opt_info, opt_name, value, opt_ptr); } else if (opt_info.IsStruct() || opt_info.IsConfigurable()) { @@ -403,6 +480,32 @@ Status ConfigurableHelper::ConfigureOption( } #endif // ROCKSDB_LITE +Status ConfigurableHelper::ConfigureNewObject( + const ConfigOptions& config_options_in, Configurable* object, + const std::string& id, const std::string& base_opts, + const std::unordered_map& opts) { + if (object != nullptr) { + ConfigOptions config_options = config_options_in; + config_options.invoke_prepare_options = false; + if (!base_opts.empty()) { +#ifndef ROCKSDB_LITE + // Don't run prepare options on the base, as we would do that on the + // overlay opts instead + Status status = object->ConfigureFromString(config_options, base_opts); + if (!status.ok()) { + return status; + } +#endif // ROCKSDB_LITE + } + if (!opts.empty()) { + return object->ConfigureFromMap(config_options, opts); + } + } else if (!opts.empty()) { // No object but no map. 
This is OK + return Status::InvalidArgument("Cannot configure null object ", id); + } + return Status::OK(); +} + //******************************************************************************* // // Methods for Converting Options into strings @@ -489,8 +592,25 @@ Status ConfigurableHelper::SerializeOptions(const ConfigOptions& config_options, const auto& opt_info = map_iter.second; if (opt_info.ShouldSerialize()) { std::string value; - Status s = opt_info.Serialize(config_options, prefix + opt_name, - opt_iter.opt_ptr, &value); + Status s; + if (!config_options.mutable_options_only) { + s = opt_info.Serialize(config_options, prefix + opt_name, + opt_iter.opt_ptr, &value); + } else if (opt_info.IsMutable()) { + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + s = opt_info.Serialize(copy, prefix + opt_name, opt_iter.opt_ptr, + &value); + } else if (opt_info.IsConfigurable()) { + // If it is a Configurable and we are either printing all of the + // details or not printing only the name, this option should be + // included in the list + if (config_options.IsDetailed() || + !opt_info.IsEnabled(OptionTypeFlags::kStringNameOnly)) { + s = opt_info.Serialize(config_options, prefix + opt_name, + opt_iter.opt_ptr, &value); + } + } if (!s.ok()) { return s; } else if (!value.empty()) { @@ -519,7 +639,7 @@ Status Configurable::GetOptionNames( } Status ConfigurableHelper::ListOptions( - const ConfigOptions& /*config_options*/, const Configurable& configurable, + const ConfigOptions& config_options, const Configurable& configurable, const std::string& prefix, std::unordered_set* result) { Status status; for (auto const& opt_iter : configurable.options_) { @@ -529,7 +649,11 @@ Status ConfigurableHelper::ListOptions( // If the option is no longer used in rocksdb and marked as deprecated, // we skip it in the serialization. 
if (!opt_info.IsDeprecated() && !opt_info.IsAlias()) { - result->emplace(prefix + opt_name); + if (!config_options.mutable_options_only) { + result->emplace(prefix + opt_name); + } else if (opt_info.IsMutable()) { + result->emplace(prefix + opt_name); + } } } } @@ -594,11 +718,23 @@ bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options, return false; } else { for (const auto& map_iter : *(o.type_map)) { - if (config_options.IsCheckEnabled(map_iter.second.GetSanityLevel()) && - !this_one.OptionsAreEqual(config_options, map_iter.second, - map_iter.first, this_offset, - that_offset, mismatch)) { - return false; + const auto& opt_info = map_iter.second; + if (config_options.IsCheckEnabled(opt_info.GetSanityLevel())) { + if (!config_options.mutable_options_only) { + if (!this_one.OptionsAreEqual(config_options, opt_info, + map_iter.first, this_offset, + that_offset, mismatch)) { + return false; + } + } else if (opt_info.IsMutable()) { + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + if (!this_one.OptionsAreEqual(copy, opt_info, map_iter.first, + this_offset, that_offset, + mismatch)) { + return false; + } + } } } } @@ -607,4 +743,47 @@ bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options, return true; } #endif // ROCKSDB_LITE + +Status ConfigurableHelper::GetOptionsMap( + const std::string& value, const Customizable* customizable, std::string* id, + std::unordered_map* props) { + if (customizable != nullptr) { + return GetOptionsMap(value, customizable->GetId(), id, props); + } else { + return GetOptionsMap(value, "", id, props); + } +} + +Status ConfigurableHelper::GetOptionsMap( + const std::string& value, const std::string& default_id, std::string* id, + std::unordered_map* props) { + assert(id); + assert(props); + Status status; + if (value.empty() || value == kNullptrString) { + *id = default_id; + } else if (value.find('=') == std::string::npos) { + *id = value; +#ifndef ROCKSDB_LITE + } else { + status = StringToMap(value, props); + if (status.ok()) { + auto iter = props->find(ConfigurableHelper::kIdPropName); + if (iter != props->end()) { + *id = iter->second; + props->erase(iter); + } else if (default_id.empty()) { // Should this be an error?? + status = Status::InvalidArgument("Name property is missing"); + } else { + *id = default_id; + } + } +#else + } else { + *id = value; + props->clear(); +#endif + } + return status; +} } // namespace ROCKSDB_NAMESPACE diff --git a/options/configurable_helper.h b/options/configurable_helper.h index 6a2454727e2..b822b0b8eec 100644 --- a/options/configurable_helper.h +++ b/options/configurable_helper.h @@ -20,35 +20,8 @@ namespace ROCKSDB_NAMESPACE { // of configuring the objects. class ConfigurableHelper { public: - // Registers the input name with the options and associated map. - // When classes register their options in this manner, most of the - // functionality (excluding unknown options and validate/prepare) is - // implemented by the base class. - // - // This method should be called in the class constructor to register the - // option set for this object. For example, to register the options - // associated with the BlockBasedTableFactory, the constructor calls this - // method passing in: - // - the name of the options ("BlockBasedTableOptions"); - // - the options object (the BlockBasedTableOptions object for this object; - // - the options type map for the BlockBasedTableOptions. 
- // This registration allows the Configurable class to process the option - // values associated with the BlockBasedTableOptions without further code in - // the derived class. - // - // @param name The name of this set of options (@see GetOptionsPtr) - // @param opt_ptr Pointer to the options to associate with this name - // @param opt_map Options map that controls how this option is configured. - template - static void RegisterOptions( - Configurable& configurable, T* opt_ptr, - const std::unordered_map* opt_map) { - RegisterOptions(configurable, T::kName(), opt_ptr, opt_map); - } - static void RegisterOptions( - Configurable& configurable, const std::string& name, void* opt_ptr, - const std::unordered_map* opt_map); - + constexpr static const char* kIdPropName = "id"; + constexpr static const char* kIdPropSuffix = ".id"; // Configures the input Configurable object based on the parameters. // On successful completion, the Configurable is updated with the settings // from the opt_map. @@ -75,6 +48,43 @@ class ConfigurableHelper { const std::unordered_map& options, std::unordered_map* unused); + // Helper method for configuring a new customizable object. + // If base_opts are set, this is the "default" options to use for the new + // object. Then any values in "new_opts" are applied to the object. + // Returns OK if the object could be successfully configured + // @return NotFound If any of the names in the base or new opts were not valid + // for this object. + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. + static Status ConfigureNewObject( + const ConfigOptions& config_options, Configurable* object, + const std::string& id, const std::string& base_opts, + const std::unordered_map& new_opts); + + // Splits the input opt_value into the ID field and the remaining options. + // The input opt_value can be in the form of "name" or "name=value + // [;name=value]". The first form uses the "name" as an id with no options The + // latter form converts the input into a map of name=value pairs and sets "id" + // to the "id" value from the map. + // @param opt_value The value to split into id and options + // @param id The id field from the opt_value + // @param options The remaining name/value pairs from the opt_value + // @param default_id If specified and there is no id field in the map, this + // value is returned as the ID + // @return OK if the value was converted to a map succesfully and an ID was + // found. + // @return InvalidArgument if the value could not be converted to a map or + // there was or there is no id property in the map. + static Status GetOptionsMap( + const std::string& opt_value, const Customizable* custom, std::string* id, + std::unordered_map* options); + static Status GetOptionsMap( + const std::string& opt_value, const std::string& default_id, + std::string* id, std::unordered_map* options); + #ifndef ROCKSDB_LITE // Internal method to configure a set of options for this object. // Classes may override this value to change its behavior. 
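As an illustrative sketch (not part of the patch; the wrapper function name is hypothetical), the two input forms described above would split roughly as follows, assuming the GetOptionsMap overloads declared in this header:

void GetOptionsMapSketch() {
  std::string id;
  std::unordered_map<std::string, std::string> props;
  // A bare name: the whole value becomes the id and no properties are returned.
  Status s = ConfigurableHelper::GetOptionsMap("BlockBasedTable", "", &id, &props);
  // id == "BlockBasedTable", props is empty
  // name=value pairs: the "id" entry is extracted; the remaining pairs stay in props.
  s = ConfigurableHelper::GetOptionsMap("id=A;int=1;bool=true", "", &id, &props);
  // id == "A", props == {{"int", "1"}, {"bool", "true"}}
}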
@@ -205,6 +215,11 @@ class ConfigurableHelper { static const OptionTypeInfo* FindOption( const std::vector& options, const std::string& name, std::string* opt_name, void** opt_ptr); + + static Status ConfigureCustomizableOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr); #endif // ROCKSDB_LITE }; diff --git a/options/configurable_test.cc b/options/configurable_test.cc index 27f8775269b..5983e2dc614 100644 --- a/options/configurable_test.cc +++ b/options/configurable_test.cc @@ -45,6 +45,22 @@ class StringLogger : public Logger { private: std::string string_; }; +static std::unordered_map struct_option_info = { +#ifndef ROCKSDB_LITE + {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kMutable)}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map imm_struct_option_info = + { +#ifndef ROCKSDB_LITE + {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; class SimpleConfigurable : public TestConfigurable { public: @@ -62,46 +78,20 @@ class SimpleConfigurable : public TestConfigurable { : TestConfigurable(name, mode, map) { if ((mode & TestConfigMode::kUniqueMode) != 0) { unique_.reset(SimpleConfigurable::Create("Unique" + name_)); - ConfigurableHelper::RegisterOptions(*this, name_ + "Unique", &unique_, - &unique_option_info); + RegisterOptions(name_ + "Unique", &unique_, &unique_option_info); } if ((mode & TestConfigMode::kSharedMode) != 0) { shared_.reset(SimpleConfigurable::Create("Shared" + name_)); - ConfigurableHelper::RegisterOptions(*this, name_ + "Shared", &shared_, - &shared_option_info); + RegisterOptions(name_ + "Shared", &shared_, &shared_option_info); } if ((mode & TestConfigMode::kRawPtrMode) != 0) { pointer_ = SimpleConfigurable::Create("Pointer" + name_); - ConfigurableHelper::RegisterOptions(*this, name_ + "Pointer", &pointer_, - &pointer_option_info); + RegisterOptions(name_ + "Pointer", &pointer_, &pointer_option_info); } } }; // End class SimpleConfigurable -static std::unordered_map wrapped_option_info = { -#ifndef ROCKSDB_LITE - {"inner", - {0, OptionType::kConfigurable, OptionVerificationType::kNormal, - OptionTypeFlags::kShared}}, -#endif // ROCKSDB_LITE -}; -class WrappedConfigurable : public SimpleConfigurable { - public: - WrappedConfigurable(const std::string& name, unsigned char mode, - const std::shared_ptr& t) - : SimpleConfigurable(name, mode, &simple_option_info), inner_(t) { - ConfigurableHelper::RegisterOptions(*this, "WrappedOptions", &inner_, - &wrapped_option_info); - } - - protected: - Configurable* Inner() const override { return inner_.get(); } - - private: - std::shared_ptr inner_; -}; - using ConfigTestFactoryFunc = std::function; class ConfigurableTest : public testing::Test { @@ -257,19 +247,15 @@ class ValidatedConfigurable : public SimpleConfigurable { : SimpleConfigurable(name, TestConfigMode::kDefaultMode), validated(false), prepared(0) { - ConfigurableHelper::RegisterOptions(*this, "Validated", &validated, - &validated_option_info); - ConfigurableHelper::RegisterOptions(*this, "Prepared", &prepared, - &prepared_option_info); + RegisterOptions("Validated", &validated, &validated_option_info); + RegisterOptions("Prepared", &prepared, &prepared_option_info); if ((mode & TestConfigMode::kUniqueMode) != 0) { 
unique_.reset(new ValidatedConfigurable( "Unique" + name_, TestConfigMode::kDefaultMode, false)); if (dont_prepare) { - ConfigurableHelper::RegisterOptions(*this, name_ + "Unique", &unique_, - &dont_prepare_option_info); + RegisterOptions(name_ + "Unique", &unique_, &dont_prepare_option_info); } else { - ConfigurableHelper::RegisterOptions(*this, name_ + "Unique", &unique_, - &unique_option_info); + RegisterOptions(name_ + "Unique", &unique_, &unique_option_info); } } } @@ -345,6 +331,69 @@ TEST_F(ConfigurableTest, PrepareOptionsTest) { ASSERT_EQ(*up, 0); } +TEST_F(ConfigurableTest, MutableOptionsTest) { + static std::unordered_map imm_option_info = { +#ifndef ROCKSDB_LITE + {"imm", OptionTypeInfo::Struct("imm", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE + }; + + class MutableConfigurable : public SimpleConfigurable { + public: + MutableConfigurable() + : SimpleConfigurable("mutable", TestConfigMode::kDefaultMode | + TestConfigMode::kUniqueMode | + TestConfigMode::kSharedMode) { + RegisterOptions("struct", &options_, &struct_option_info); + RegisterOptions("imm", &options_, &imm_option_info); + } + }; + MutableConfigurable mc; + ConfigOptions options = config_options_; + + ASSERT_OK(mc.ConfigureOption(options, "bool", "true")); + ASSERT_OK(mc.ConfigureOption(options, "int", "42")); + auto* opts = mc.GetOptions("mutable"); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->i, 42); + ASSERT_EQ(opts->b, true); + ASSERT_OK(mc.ConfigureOption(options, "struct", "{bool=false;}")); + ASSERT_OK(mc.ConfigureOption(options, "imm", "{int=55;}")); + + options.mutable_options_only = true; + + // Now only mutable options should be settable. + ASSERT_NOK(mc.ConfigureOption(options, "bool", "true")); + ASSERT_OK(mc.ConfigureOption(options, "int", "24")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + ASSERT_NOK(mc.ConfigureFromString(options, "bool=false;int=33;")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + + // Setting options through an immutable struct fails + ASSERT_NOK(mc.ConfigureOption(options, "imm", "{int=55;}")); + ASSERT_NOK(mc.ConfigureOption(options, "imm.int", "55")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + + // Setting options through an mutable struct succeeds + ASSERT_OK(mc.ConfigureOption(options, "struct", "{int=44;}")); + ASSERT_EQ(opts->i, 44); + ASSERT_OK(mc.ConfigureOption(options, "struct.int", "55")); + ASSERT_EQ(opts->i, 55); + + // Setting nested immutable configurable options fail + ASSERT_NOK(mc.ConfigureOption(options, "shared", "{bool=true;}")); + ASSERT_NOK(mc.ConfigureOption(options, "shared.bool", "true")); + + // Setting nested mutable configurable options succeeds + ASSERT_OK(mc.ConfigureOption(options, "unique", "{bool=true}")); + ASSERT_OK(mc.ConfigureOption(options, "unique.bool", "true")); +} + TEST_F(ConfigurableTest, DeprecatedOptionsTest) { static std::unordered_map deprecated_option_info = { @@ -476,13 +525,6 @@ TEST_F(ConfigurableTest, MatchesTest) { } static Configurable* SimpleStructFactory() { - static std::unordered_map struct_option_info = { -#ifndef ROCKSDB_LITE - {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, - OptionVerificationType::kNormal, - OptionTypeFlags::kMutable)}, -#endif // ROCKSDB_LITE - }; return SimpleConfigurable::Create( "simple-struct", TestConfigMode::kDefaultMode, &struct_option_info); } @@ -607,17 +649,6 @@ static std::unordered_map TestFactories = { TestConfigMode::kSimpleMode | TestConfigMode::kNestedMode); 
}}, - {"ThreeWay", - []() { - std::shared_ptr child; - child.reset( - SimpleConfigurable::Create("child", TestConfigMode::kDefaultMode)); - std::shared_ptr parent; - parent.reset(new WrappedConfigurable( - "parent", TestConfigMode::kDefaultMode, child)); - return new WrappedConfigurable("master", TestConfigMode::kDefaultMode, - parent); - }}, {"ThreeDeep", []() { Configurable* simple = SimpleConfigurable::Create( @@ -765,10 +796,6 @@ INSTANTIATE_TEST_CASE_P( "pointer={int=22;string=pointer};" "unique={int=33;string=unique};" "shared={int=44;string=shared}"), - std::pair("ThreeWay", - "int=11;bool=true;string=outer;" - "inner={int=22;string=parent;" - "inner={int=33;string=child}};"), std::pair("ThreeDeep", "int=11;bool=true;string=outer;" "unique={int=22;string=inner;" diff --git a/options/configurable_test.h b/options/configurable_test.h index 52c3599f667..cf9d0667837 100644 --- a/options/configurable_test.h +++ b/options/configurable_test.h @@ -112,11 +112,10 @@ class TestConfigurable : public Configurable { : name_(name), pointer_(nullptr) { prefix_ = "test." + name + "."; if ((mode & TestConfigMode::kSimpleMode) != 0) { - ConfigurableHelper::RegisterOptions(*this, name_, &options_, map); + RegisterOptions(name_, &options_, map); } if ((mode & TestConfigMode::kEnumMode) != 0) { - ConfigurableHelper::RegisterOptions(*this, name_ + "Enum", &options_, - &enum_option_info); + RegisterOptions(name_ + "Enum", &options_, &enum_option_info); } } diff --git a/options/customizable.cc b/options/customizable.cc new file mode 100644 index 00000000000..3488f326bac --- /dev/null +++ b/options/customizable.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/customizable.h" + +#include "options/configurable_helper.h" +#include "rocksdb/convenience.h" +#include "rocksdb/status.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string Customizable::GetOptionName(const std::string& long_name) const { + const std::string& name = Name(); + size_t name_len = name.size(); + if (long_name.size() > name_len + 1 && + long_name.compare(0, name_len, name) == 0 && + long_name.at(name_len) == '.') { + return long_name.substr(name_len + 1); + } else { + return Configurable::GetOptionName(long_name); + } +} + +#ifndef ROCKSDB_LITE +Status Customizable::GetOption(const ConfigOptions& config_options, + const std::string& opt_name, + std::string* value) const { + if (opt_name == ConfigurableHelper::kIdPropName) { + *value = GetId(); + return Status::OK(); + } else { + return Configurable::GetOption(config_options, opt_name, value); + } +} + +std::string Customizable::SerializeOptions(const ConfigOptions& config_options, + const std::string& prefix) const { + std::string result; + std::string parent; + if (!config_options.IsShallow()) { + parent = Configurable::SerializeOptions(config_options, ""); + } + if (parent.empty()) { + result = GetId(); + } else { + result.append(prefix + ConfigurableHelper::kIdPropName + "=" + GetId() + + config_options.delimiter); + result.append(parent); + } + return result; +} + +#endif // ROCKSDB_LITE + +bool Customizable::AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const { + if (config_options.sanity_level > ConfigOptions::kSanityLevelNone && + this != other) { + const Customizable* custom = reinterpret_cast(other); + if (GetId() != custom->GetId()) { + *mismatch = ConfigurableHelper::kIdPropName; + return false; + } else if (config_options.sanity_level > + ConfigOptions::kSanityLevelLooselyCompatible) { + bool matches = + Configurable::AreEquivalent(config_options, other, mismatch); + return matches; + } + } + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/options/customizable_helper.h b/options/customizable_helper.h new file mode 100644 index 00000000000..cd7cc26f830 --- /dev/null +++ b/options/customizable_helper.h @@ -0,0 +1,221 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include +#include + +#include "options/configurable_helper.h" +#include "rocksdb/convenience.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/object_registry.h" + +namespace ROCKSDB_NAMESPACE { +template +using SharedFactoryFunc = + std::function*)>; + +template +using UniqueFactoryFunc = + std::function*)>; + +template +using StaticFactoryFunc = std::function; + +// Creates a new shared Customizable object based on the input parameters. +// This method parses the input value to determine the type of instance to +// create. If there is an existing instance (in result) and it is the same type +// as the object being created, the existing configuration is stored and used as +// the default for the new object. +// +// The value parameter specified the instance class of the object to create. +// If it is a simple string (e.g. BlockBasedTable), then the instance will be +// created using the default settings. 
If the value is a set of name-value +// pairs, then the "id" value is used to determine the instance to create and +// the remaining parameters are used to configure the object. If name-value +// pairs are specified, there must be an "id=value" pairing or an error will +// result. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions). +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to +// create and initialize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template <typename T> +static Status LoadSharedObject(const ConfigOptions& config_options, + const std::string& value, + const SharedFactoryFunc<T>& func, + std::shared_ptr<T>* result) { + std::string id; + std::unordered_map<std::string, std::string> opt_map; + Status status = + ConfigurableHelper::GetOptionsMap(value, result->get(), &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + std::string curr_opts; +#ifndef ROCKSDB_LITE + if (result->get() != nullptr && result->get()->GetId() == id) { + // Try to get the existing options, ignoring any errors + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + result->get()->GetOptionString(embedded, &curr_opts).PermitUncheckedError(); + } +#endif + if (func == nullptr || !func(id, result)) { // No factory, or it failed + if (value.empty()) { + // No Id and no options. Clear the object + result->reset(); + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = config_options.registry->NewSharedObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif + if (!status.ok()) { + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + return Status::OK(); + } else { + return status; + } + } + } + } + return ConfigurableHelper::ConfigureNewObject(config_options, result->get(), + id, curr_opts, opt_map); +} + +// Creates a new unique customizable instance object based on the input +// parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to +// create and initialize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance.
+template +static Status LoadUniqueObject(const ConfigOptions& config_options, + const std::string& value, + const UniqueFactoryFunc& func, + std::unique_ptr* result) { + std::string id; + std::unordered_map opt_map; + Status status = + ConfigurableHelper::GetOptionsMap(value, result->get(), &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + std::string curr_opts; +#ifndef ROCKSDB_LITE + if (result->get() != nullptr && result->get()->GetId() == id) { + // Try to get the existing options, ignoring any errors + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + result->get()->GetOptionString(embedded, &curr_opts).PermitUncheckedError(); + } +#endif + if (func == nullptr || !func(id, result)) { // No factory, or it failed + if (value.empty()) { + // No Id and no options. Clear the object + result->reset(); + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = config_options.registry->NewUniqueObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (!status.ok()) { + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + return Status::OK(); + } else { + return status; + } + } + } + } + return ConfigurableHelper::ConfigureNewObject(config_options, result->get(), + id, curr_opts, opt_map); +} +// Creates a new static (raw pointer) customizable instance object based on the +// input parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to +// create and initailzie the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadStaticObject(const ConfigOptions& config_options, + const std::string& value, + const StaticFactoryFunc& func, T** result) { + std::string id; + std::unordered_map opt_map; + Status status = + ConfigurableHelper::GetOptionsMap(value, *result, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + std::string curr_opts; +#ifndef ROCKSDB_LITE + if (*result != nullptr && (*result)->GetId() == id) { + // Try to get the existing options, ignoring any errors + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + (*result)->GetOptionString(embedded, &curr_opts).PermitUncheckedError(); + } +#endif + if (func == nullptr || !func(id, result)) { // No factory, or it failed + if (value.empty()) { + // No Id and no options. Clear the object + *result = nullptr; + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. 
Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = config_options.registry->NewStaticObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (!status.ok()) { + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + return Status::OK(); + } else { + return status; + } + } + } + } + return ConfigurableHelper::ConfigureNewObject(config_options, *result, id, + curr_opts, opt_map); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/options/customizable_test.cc b/options/customizable_test.cc new file mode 100644 index 00000000000..d48ed104034 --- /dev/null +++ b/options/customizable_test.cc @@ -0,0 +1,831 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/customizable.h" + +#include +#include +#include +#include + +#include "options/configurable_helper.h" +#include "options/customizable_helper.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "table/mock_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { +class StringLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + char buffer[1000]; + vsnprintf(buffer, sizeof(buffer), format, ap); + string_.append(buffer); + } + const std::string& str() const { return string_; } + void clear() { string_.clear(); } + + private: + std::string string_; +}; + +class TestCustomizable : public Customizable { + public: + TestCustomizable(const std::string& name) : name_(name) {} + // Method to allow CheckedCast to work for this class + static const char* kClassName() { + return "TestCustomizable"; + ; + } + + const char* Name() const override { return name_.c_str(); } + static const char* Type() { return "test.custom"; } + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::unique_ptr* result); + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::shared_ptr* result); + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + TestCustomizable** result); + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return Customizable::IsInstanceOf(name); + } + } + + protected: + const std::string name_; +}; + +struct AOptions { + int i = 0; + bool b = false; +}; + +static std::unordered_map a_option_info = { +#ifndef ROCKSDB_LITE + {"int", + {offsetof(struct AOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bool", + {offsetof(struct 
AOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +class ACustomizable : public TestCustomizable { + public: + explicit ACustomizable(const std::string& id) + : TestCustomizable("A"), id_(id) { + RegisterOptions("A", &opts_, &a_option_info); + } + std::string GetId() const override { return id_; } + static const char* kClassName() { return "A"; } + + private: + AOptions opts_; + const std::string id_; +}; + +#ifndef ROCKSDB_LITE +static int A_count = 0; +const FactoryFunc& a_func = + ObjectLibrary::Default()->Register( + "A.*", + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new ACustomizable(name)); + A_count++; + return guard->get(); + }); +#endif // ROCKSDB_LITE + +struct BOptions { + std::string s; + bool b = false; +}; + +static std::unordered_map b_option_info = { +#ifndef ROCKSDB_LITE + {"string", + {offsetof(struct BOptions, s), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bool", + {offsetof(struct BOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +class BCustomizable : public TestCustomizable { + private: + public: + explicit BCustomizable(const std::string& name) : TestCustomizable(name) { + RegisterOptions(name, &opts_, &b_option_info); + } + static const char* kClassName() { return "B"; } + + private: + BOptions opts_; +}; + +static bool LoadSharedB(const std::string& id, + std::shared_ptr* result) { + if (id == "B") { + result->reset(new BCustomizable(id)); + return true; + } else if (id.empty()) { + result->reset(); + return true; + } else { + return false; + } +} +Status TestCustomizable::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { + return LoadSharedObject(config_options, value, LoadSharedB, + result); +} + +Status TestCustomizable::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::unique_ptr* result) { + return LoadUniqueObject( + config_options, value, + [](const std::string& id, std::unique_ptr* u) { + if (id == "B") { + u->reset(new BCustomizable(id)); + return true; + } else if (id.empty()) { + u->reset(); + return true; + } else { + return false; + } + }, + result); +} + +Status TestCustomizable::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + TestCustomizable** result) { + return LoadStaticObject( + config_options, value, + [](const std::string& id, TestCustomizable** ptr) { + if (id == "B") { + *ptr = new BCustomizable(id); + return true; + } else if (id.empty()) { + *ptr = nullptr; + return true; + } else { + return false; + } + }, + result); +} + +#ifndef ROCKSDB_LITE +const FactoryFunc& s_func = + ObjectLibrary::Default()->Register( + "S", [](const std::string& name, + std::unique_ptr* /* guard */, + std::string* /* msg */) { return new BCustomizable(name); }); +#endif // ROCKSDB_LITE + +struct SimpleOptions { + bool b = true; + bool is_mutable = true; + std::unique_ptr cu; + std::shared_ptr cs; + TestCustomizable* cp = nullptr; +}; + +static std::unordered_map simple_option_info = { +#ifndef ROCKSDB_LITE + {"bool", + {offsetof(struct SimpleOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"unique", OptionTypeInfo::AsCustomUniquePtr( + offsetof(struct SimpleOptions, cu), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + 
{"shared", OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct SimpleOptions, cs), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"pointer", OptionTypeInfo::AsCustomRawPtr( + offsetof(struct SimpleOptions, cp), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +class SimpleConfigurable : public Configurable { + private: + SimpleOptions simple_; + + public: + SimpleConfigurable() { + RegisterOptions("simple", &simple_, &simple_option_info); + } + + explicit SimpleConfigurable( + const std::unordered_map* map) { + RegisterOptions("simple", &simple_, map); + } + + bool IsPrepared() const override { + if (simple_.is_mutable) { + return false; + } else { + return Configurable::IsPrepared(); + } + } + + private: +}; + +class CustomizableTest : public testing::Test { + public: + ConfigOptions config_options_; +}; + +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE +// Tests that a Customizable can be created by: +// - a simple name +// - a XXX.id option +// - a property with a name +TEST_F(CustomizableTest, CreateByNameTest) { + ObjectLibrary::Default()->Register( + "TEST.*", + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new TestCustomizable(name)); + return guard->get(); + }); + std::unique_ptr configurable(new SimpleConfigurable()); + SimpleOptions* simple = configurable->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique={id=TEST_1}")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_1"); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique.id=TEST_2")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_2"); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique=TEST_3")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_3"); +} + +TEST_F(CustomizableTest, ToStringTest) { + std::unique_ptr custom(new TestCustomizable("test")); + ASSERT_EQ(custom->ToString(config_options_), "test"); +} + +TEST_F(CustomizableTest, SimpleConfigureTest) { + std::unordered_map opt_map = { + {"unique", "id=A;int=1;bool=true"}, + {"shared", "id=B;string=s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); + std::string opt_str; + std::string mismatch; + ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str)); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE( + configurable->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +static void GetMapFromProperties( + const std::string& props, + std::unordered_map* map) { + std::istringstream iss(props); + std::unordered_map copy_map; + std::string line; + map->clear(); + for (int line_num = 0; std::getline(iss, line); line_num++) { + std::string name; + std::string value; + ASSERT_OK( + RocksDBOptionsParser::ParseStatement(&name, &value, line, line_num)); + (*map)[name] = value; + } +} + +TEST_F(CustomizableTest, ConfigureFromPropsTest) { + std::unordered_map opt_map = { + {"unique.id", "A"}, {"unique.A.int", "1"}, {"unique.A.bool", "true"}, + {"shared.id", "B"}, {"shared.B.string", "s"}, + }; + 
std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); + std::string opt_str; + std::string mismatch; + config_options_.delimiter = "\n"; + std::unordered_map props; + ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str)); + GetMapFromProperties(opt_str, &props); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromMap(config_options_, props)); + ASSERT_TRUE( + configurable->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CustomizableTest, ConfigureFromShortTest) { + std::unordered_map opt_map = { + {"unique.id", "A"}, {"unique.A.int", "1"}, {"unique.A.bool", "true"}, + {"shared.id", "B"}, {"shared.B.string", "s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); +} + +TEST_F(CustomizableTest, AreEquivalentOptionsTest) { + std::unordered_map opt_map = { + {"unique", "id=A;int=1;bool=true"}, + {"shared", "id=A;int=1;bool=true"}, + }; + std::string mismatch; + ConfigOptions config_options = config_options_; + config_options.invoke_prepare_options = false; + std::unique_ptr c1(new SimpleConfigurable()); + std::unique_ptr c2(new SimpleConfigurable()); + ASSERT_OK(c1->ConfigureFromMap(config_options, opt_map)); + ASSERT_OK(c2->ConfigureFromMap(config_options, opt_map)); + ASSERT_TRUE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + SimpleOptions* simple = c1->GetOptions("simple"); + ASSERT_TRUE( + simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch)); + ASSERT_OK(simple->cu->ConfigureOption(config_options, "int", "2")); + ASSERT_FALSE( + simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + ConfigOptions loosely = config_options; + loosely.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_TRUE(simple->cu->AreEquivalent(loosely, simple->cs.get(), &mismatch)); + + ASSERT_OK(c1->ConfigureOption(config_options, "shared", "id=B;string=3")); + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + ASSERT_FALSE(simple->cs->AreEquivalent(loosely, simple->cu.get(), &mismatch)); + simple->cs.reset(); + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); +} + +// Tests that we can initialize a customizable from its options +TEST_F(CustomizableTest, ConfigureStandaloneCustomTest) { + std::unique_ptr base, copy; + auto registry = ObjectRegistry::NewInstance(); + ASSERT_OK(registry->NewUniqueObject("A", &base)); + ASSERT_OK(registry->NewUniqueObject("A", ©)); + ASSERT_OK(base->ConfigureFromString(config_options_, "int=33;bool=true")); + std::string opt_str; + std::string mismatch; + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +// Tests that we 
fail appropriately if the pattern is not registered +TEST_F(CustomizableTest, BadNameTest) { + config_options_.ignore_unsupported_options = false; + std::unique_ptr c1(new SimpleConfigurable()); + ASSERT_NOK( + c1->ConfigureFromString(config_options_, "unique.shared.id=bad name")); + config_options_.ignore_unsupported_options = true; + ASSERT_OK( + c1->ConfigureFromString(config_options_, "unique.shared.id=bad name")); +} + +// Tests that we fail appropriately if a bad option is passed to the underlying +// configurable +TEST_F(CustomizableTest, BadOptionTest) { + std::unique_ptr c1(new SimpleConfigurable()); + ConfigOptions ignore = config_options_; + ignore.ignore_unknown_options = true; + + ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.int=11")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "shared={id=B;int=1}")); + ASSERT_OK(c1->ConfigureFromString(ignore, "shared={id=A;string=s}")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "B.int=11")); + ASSERT_OK(c1->ConfigureFromString(ignore, "B.int=11")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.string=s")); + ASSERT_OK(c1->ConfigureFromString(ignore, "A.string=s")); + // Test as detached + ASSERT_NOK( + c1->ConfigureFromString(config_options_, "shared.id=A;A.string=b}")); + ASSERT_OK(c1->ConfigureFromString(ignore, "shared.id=A;A.string=s}")); +} + +// Tests that different IDs lead to different objects +TEST_F(CustomizableTest, UniqueIdTest) { + std::unique_ptr base(new SimpleConfigurable()); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=true}")); + SimpleOptions* simple = base->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), std::string("A_1")); + std::string opt_str; + std::string mismatch; + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_2;int=1;bool=true}")); + ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_EQ(simple->cu->GetId(), std::string("A_2")); +} + +TEST_F(CustomizableTest, IsInstanceOfTest) { + std::shared_ptr tc = std::make_shared("A"); + + ASSERT_TRUE(tc->IsInstanceOf("A")); + ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable")); + ASSERT_FALSE(tc->IsInstanceOf("B")); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), nullptr); + + tc.reset(new BCustomizable("B")); + ASSERT_TRUE(tc->IsInstanceOf("B")); + ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable")); + ASSERT_FALSE(tc->IsInstanceOf("A")); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), nullptr); +} + +static std::unordered_map inner_option_info = { +#ifndef ROCKSDB_LITE + {"inner", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kStringNameOnly)} +#endif // ROCKSDB_LITE +}; + +class InnerCustomizable : public Customizable { + public: + explicit InnerCustomizable(const std::shared_ptr& w) + : inner_(w) {} + static const char* kClassName() { return "Inner"; } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return Customizable::IsInstanceOf(name); + } + } + + protected: + const 
Customizable* Inner() const override { return inner_.get(); } + + private: + std::shared_ptr inner_; +}; + +class WrappedCustomizable1 : public InnerCustomizable { + public: + explicit WrappedCustomizable1(const std::shared_ptr& w) + : InnerCustomizable(w) {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Wrapped1"; } +}; + +class WrappedCustomizable2 : public InnerCustomizable { + public: + explicit WrappedCustomizable2(const std::shared_ptr& w) + : InnerCustomizable(w) {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Wrapped2"; } +}; + +TEST_F(CustomizableTest, WrappedInnerTest) { + std::shared_ptr ac = + std::make_shared("A"); + + ASSERT_TRUE(ac->IsInstanceOf("A")); + ASSERT_TRUE(ac->IsInstanceOf("TestCustomizable")); + ASSERT_EQ(ac->CheckedCast(), ac.get()); + ASSERT_EQ(ac->CheckedCast(), nullptr); + ASSERT_EQ(ac->CheckedCast(), nullptr); + ASSERT_EQ(ac->CheckedCast(), nullptr); + std::shared_ptr wc1 = + std::make_shared(ac); + + ASSERT_TRUE(wc1->IsInstanceOf(WrappedCustomizable1::kClassName())); + ASSERT_EQ(wc1->CheckedCast(), wc1.get()); + ASSERT_EQ(wc1->CheckedCast(), nullptr); + ASSERT_EQ(wc1->CheckedCast(), wc1.get()); + ASSERT_EQ(wc1->CheckedCast(), ac.get()); + + std::shared_ptr wc2 = + std::make_shared(wc1); + ASSERT_TRUE(wc2->IsInstanceOf(WrappedCustomizable2::kClassName())); + ASSERT_EQ(wc2->CheckedCast(), wc2.get()); + ASSERT_EQ(wc2->CheckedCast(), wc1.get()); + ASSERT_EQ(wc2->CheckedCast(), wc2.get()); + ASSERT_EQ(wc2->CheckedCast(), ac.get()); +} + +class ShallowCustomizable : public Customizable { + public: + ShallowCustomizable() { + inner_ = std::make_shared("a"); + RegisterOptions("inner", &inner_, &inner_option_info); + }; + static const char* kClassName() { return "shallow"; } + const char* Name() const override { return kClassName(); } + + private: + std::shared_ptr inner_; +}; + +TEST_F(CustomizableTest, TestStringDepth) { + ConfigOptions shallow = config_options_; + std::unique_ptr c(new ShallowCustomizable()); + std::string opt_str; + shallow.depth = ConfigOptions::Depth::kDepthShallow; + ASSERT_OK(c->GetOptionString(shallow, &opt_str)); + ASSERT_EQ(opt_str, "inner=a;"); + shallow.depth = ConfigOptions::Depth::kDepthDetailed; + ASSERT_OK(c->GetOptionString(shallow, &opt_str)); + ASSERT_NE(opt_str, "inner=a;"); +} + +// Tests that we only get a new customizable when it changes +TEST_F(CustomizableTest, NewCustomizableTest) { + std::unique_ptr base(new SimpleConfigurable()); + A_count = 0; + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=true}")); + SimpleOptions* simple = base->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(A_count, 1); // Created one A + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=false}")); + ASSERT_EQ(A_count, 2); // Create another A_1 + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_2;int=1;bool=false}")); + ASSERT_EQ(A_count, 3); // Created another A + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_EQ(A_count, 3); +} + +TEST_F(CustomizableTest, IgnoreUnknownObjects) { + ConfigOptions ignore = config_options_; + std::shared_ptr shared; + std::unique_ptr unique; + TestCustomizable* pointer = nullptr; + ignore.ignore_unsupported_options = false; + ASSERT_NOK( + LoadSharedObject(ignore, "Unknown", nullptr, &shared)); + 
ASSERT_NOK( + LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); + ASSERT_NOK( + LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ignore.ignore_unsupported_options = true; + ASSERT_OK( + LoadSharedObject(ignore, "Unknown", nullptr, &shared)); + ASSERT_OK( + LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); + ASSERT_OK( + LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown", nullptr, + &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown", nullptr, + &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown", nullptr, + &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown;option=bad", + nullptr, &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown;option=bad", + nullptr, &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown;option=bad", + nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); +} + +TEST_F(CustomizableTest, FactoryFunctionTest) { + std::shared_ptr shared; + std::unique_ptr unique; + TestCustomizable* pointer = nullptr; + ConfigOptions ignore = config_options_; + ignore.ignore_unsupported_options = false; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &shared)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &unique)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &pointer)); + ASSERT_NE(shared.get(), nullptr); + ASSERT_NE(unique.get(), nullptr); + ASSERT_NE(pointer, nullptr); + delete pointer; + pointer = nullptr; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &shared)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &unique)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &shared)); + ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &unique)); + ASSERT_NOK( + TestCustomizable::CreateFromString(ignore, "option=bad", &pointer)); + ASSERT_EQ(pointer, nullptr); +} + +TEST_F(CustomizableTest, MutableOptionsTest) { + static std::unordered_map mutable_option_info = { + {"mutable", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}}; + static std::unordered_map immutable_option_info = + {{"immutable", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone)}}; + + class MutableCustomizable : public Customizable { + private: + std::shared_ptr mutable_; + std::shared_ptr immutable_; + + public: + MutableCustomizable() { + RegisterOptions("mutable", &mutable_, &mutable_option_info); + RegisterOptions("immutable", &immutable_, &immutable_option_info); + } + const char* Name() const override { return "MutableCustomizable"; } + }; + MutableCustomizable mc; + + ConfigOptions options = config_options_; + ASSERT_FALSE(mc.IsPrepared()); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=B;}")); + ASSERT_OK(mc.ConfigureOption(options, "immutable", "{id=A; int=10}")); + auto* mm = mc.GetOptions>("mutable"); + auto* im = 
mc.GetOptions>("immutable"); + ASSERT_NE(mm, nullptr); + ASSERT_NE(mm->get(), nullptr); + ASSERT_NE(im, nullptr); + ASSERT_NE(im->get(), nullptr); + + // Now only deal with mutable options + options.mutable_options_only = true; + + // Setting nested immutable customizable options fails + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{id=B;}")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable.id", "B")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable.bool", "true")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "bool=true")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{int=11;bool=true}")); + auto* im_a = im->get()->GetOptions("A"); + ASSERT_NE(im_a, nullptr); + ASSERT_EQ(im_a->i, 10); + ASSERT_EQ(im_a->b, false); + + // Setting nested mutable customizable options succeeds but the object did not + // change + ASSERT_OK(mc.ConfigureOption(options, "immutable.int", "11")); + ASSERT_EQ(im_a->i, 11); + ASSERT_EQ(im_a, im->get()->GetOptions("A")); + + // The mutable configurable itself can be changed + ASSERT_OK(mc.ConfigureOption(options, "mutable.id", "A")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "A")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=A}")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}")); + + // The Nested options in the mutable object can be changed + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}")); + auto* mm_a = mm->get()->GetOptions("A"); + ASSERT_EQ(mm_a->b, true); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{int=11;bool=false}")); + mm_a = mm->get()->GetOptions("A"); + ASSERT_EQ(mm_a->i, 11); + ASSERT_EQ(mm_a->b, false); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +// This method loads existing test classes into the ObjectRegistry +static int RegisterTestObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + size_t num_types; + library.Register( + "MockTable", + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new mock::MockTableFactory()); + return guard->get(); + }); + return static_cast(library.GetFactoryCount(&num_types)); +} + +static int RegisterLocalObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + size_t num_types; + // Load any locally defined objects here + return static_cast(library.GetFactoryCount(&num_types)); +} + +class LoadCustomizableTest : public testing::Test { + public: + LoadCustomizableTest() { config_options_.ignore_unsupported_options = false; } + bool RegisterTests(const std::string& arg) { +#ifndef ROCKSDB_LITE + config_options_.registry->AddLibrary("custom-tests", RegisterTestObjects, + arg); + config_options_.registry->AddLibrary("local-tests", RegisterLocalObjects, + arg); + return true; +#else + (void)arg; + return false; +#endif // !ROCKSDB_LITE + } + + protected: + DBOptions db_opts_; + ColumnFamilyOptions cf_opts_; + ConfigOptions config_options_; +}; + +TEST_F(LoadCustomizableTest, LoadTableFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK( + TableFactory::CreateFromString(config_options_, "MockTable", &factory)); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, TableFactory::kBlockBasedTableName(), &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), TableFactory::kBlockBasedTableName()); + + if (RegisterTests("Test")) { + ASSERT_OK( + TableFactory::CreateFromString(config_options_, "MockTable", &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), "MockTable"); + } +} +#endif // 
!ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff --git a/options/db_options.cc b/options/db_options.cc index 4b7dba96d0a..00ff6aacb49 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -17,6 +17,7 @@ #include "rocksdb/file_system.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/system_clock.h" #include "rocksdb/utilities/options_type.h" #include "rocksdb/wal_filter.h" #include "util/string_util.h" @@ -136,6 +137,7 @@ static std::unordered_map std::shared_ptr statistics; std::vector db_paths; std::vector> listeners; + FileTypeSet checksum_handoff_file_types; */ {"advise_random_on_open", {offsetof(struct ImmutableDBOptions, advise_random_on_open), @@ -198,6 +200,15 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, paranoid_checks), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"flush_verify_memtable_count", + {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"track_and_verify_wals_in_manifest", + {offsetof(struct ImmutableDBOptions, + track_and_verify_wals_in_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"skip_log_error_on_recovery", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, @@ -265,11 +276,11 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, wal_dir), OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"WAL_size_limit_MB", - {offsetof(struct ImmutableDBOptions, wal_size_limit_mb), + {offsetof(struct ImmutableDBOptions, WAL_size_limit_MB), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"WAL_ttl_seconds", - {offsetof(struct ImmutableDBOptions, wal_ttl_seconds), + {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"max_manifest_file_size", @@ -388,6 +399,9 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, bgerror_resume_retry_interval), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"db_host_id", + {offsetof(struct ImmutableDBOptions, db_host_id), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}}, // The following properties were handled as special cases in ParseOption // This means that the properties could be read from the options file // but never written to the file or compared to each other. 
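Editor's note: the hunks above extend the name-to-OptionTypeInfo map that drives string-based option parsing in options/db_options.cc. Each entry pairs an option name with an offsetof() into the options struct plus a type tag, and the parse/serialize helpers then read or write the field through that offset. Below is a minimal, self-contained sketch of that pattern; the names (MiniOptions, MiniTypeInfo, ParseOne) are illustrative only and are not RocksDB APIs.

```cpp
// Simplified sketch of the offsetof-based registration pattern the hunks
// above extend. Illustrative names only; not part of the patch.
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>

struct MiniOptions {
  bool paranoid_checks = false;
  std::string db_host_id = "unset";
};

enum class MiniType { kBoolean, kString };

struct MiniTypeInfo {
  size_t offset;  // offsetof(MiniOptions, field)
  MiniType type;  // how to interpret the bytes at that offset
};

static const std::unordered_map<std::string, MiniTypeInfo> kMiniInfo = {
    {"paranoid_checks",
     {offsetof(MiniOptions, paranoid_checks), MiniType::kBoolean}},
    {"db_host_id", {offsetof(MiniOptions, db_host_id), MiniType::kString}},
};

// Mirrors the void* + static_cast addressing style this patch moves to.
bool ParseOne(MiniOptions* opts, const std::string& name,
              const std::string& value) {
  auto it = kMiniInfo.find(name);
  if (it == kMiniInfo.end()) {
    return false;
  }
  void* addr =
      static_cast<char*>(static_cast<void*>(opts)) + it->second.offset;
  switch (it->second.type) {
    case MiniType::kBoolean:
      *static_cast<bool*>(addr) = (value == "true" || value == "1");
      return true;
    case MiniType::kString:
      *static_cast<std::string*>(addr) = value;
      return true;
  }
  return false;
}

int main() {
  MiniOptions opts;
  ParseOne(&opts, "paranoid_checks", "true");
  ParseOne(&opts, "db_host_id", "host42");
  std::cout << opts.paranoid_checks << " " << opts.db_host_id << "\n";
  return 0;
}
```

Note that offsetof() on a struct containing std::string members is only conditionally supported by the C++ standard; the real code relies on it behaving as expected on the project's supported toolchains.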
@@ -397,9 +411,8 @@ static std::unordered_map (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), // Parse the input value as a RateLimiter [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto limiter = - reinterpret_cast*>(addr); + const std::string& value, void* addr) { + auto limiter = static_cast*>(addr); limiter->reset(NewGenericRateLimiter( static_cast(ParseUint64(value)))); return Status::OK(); @@ -409,11 +422,12 @@ static std::unordered_map OptionVerificationType::kNormal, (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), // Parse the input value as an Env - [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto old_env = reinterpret_cast(addr); // Get the old value + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto old_env = static_cast(addr); // Get the old value Env* new_env = *old_env; // Set new to old - Status s = Env::LoadEnv(value, &new_env); // Update new value + Status s = Env::CreateFromString(opts, value, + &new_env); // Update new value if (s.ok()) { // It worked *old_env = new_env; // Update the old one } @@ -433,10 +447,9 @@ const std::string OptionsHelper::kDBOptionsName = "DBOptions"; class MutableDBConfigurable : public Configurable { public: - MutableDBConfigurable(const MutableDBOptions& mdb) { + explicit MutableDBConfigurable(const MutableDBOptions& mdb) { mutable_ = mdb; - ConfigurableHelper::RegisterOptions(*this, &mutable_, - &db_mutable_options_type_info); + RegisterOptions(&mutable_, &db_mutable_options_type_info); } protected: @@ -445,7 +458,7 @@ class MutableDBConfigurable : public Configurable { class DBOptionsConfigurable : public MutableDBConfigurable { public: - DBOptionsConfigurable(const DBOptions& opts) + explicit DBOptionsConfigurable(const DBOptions& opts) : MutableDBConfigurable(MutableDBOptions(opts)), db_options_(opts) { // The ImmutableDBOptions currently requires the env to be non-null. 
Make // sure it is @@ -456,8 +469,7 @@ class DBOptionsConfigurable : public MutableDBConfigurable { copy.env = Env::Default(); immutable_ = ImmutableDBOptions(copy); } - ConfigurableHelper::RegisterOptions(*this, &immutable_, - &db_immutable_options_type_info); + RegisterOptions(&immutable_, &db_immutable_options_type_info); } protected: @@ -465,8 +477,7 @@ class DBOptionsConfigurable : public MutableDBConfigurable { const ConfigOptions& config_options, const std::unordered_map& opts_map, std::unordered_map* unused) override { - Status s = ConfigurableHelper::ConfigureOptions(config_options, *this, - opts_map, unused); + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); if (s.ok()) { db_options_ = BuildDBOptions(immutable_, mutable_); s = PrepareOptions(config_options); @@ -505,8 +516,10 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) create_missing_column_families(options.create_missing_column_families), error_if_exists(options.error_if_exists), paranoid_checks(options.paranoid_checks), + flush_verify_memtable_count(options.flush_verify_memtable_count), + track_and_verify_wals_in_manifest( + options.track_and_verify_wals_in_manifest), env(options.env), - fs(options.env->GetFileSystem()), rate_limiter(options.rate_limiter), sst_file_manager(options.sst_file_manager), info_log(options.info_log), @@ -523,8 +536,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) recycle_log_file_num(options.recycle_log_file_num), max_manifest_file_size(options.max_manifest_file_size), table_cache_numshardbits(options.table_cache_numshardbits), - wal_ttl_seconds(options.WAL_ttl_seconds), - wal_size_limit_mb(options.WAL_size_limit_MB), + WAL_ttl_seconds(options.WAL_ttl_seconds), + WAL_size_limit_MB(options.WAL_size_limit_MB), max_write_batch_group_size_bytes( options.max_write_batch_group_size_bytes), manifest_preallocation_size(options.manifest_preallocation_size), @@ -579,7 +592,19 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) max_bgerror_resume_count(options.max_bgerror_resume_count), bgerror_resume_retry_interval(options.bgerror_resume_retry_interval), allow_data_in_errors(options.allow_data_in_errors), - disable_manifest_sync(options.disable_manifest_sync) { + disable_manifest_sync(options.disable_manifest_sync), + db_host_id(options.db_host_id), + checksum_handoff_file_types(options.checksum_handoff_file_types), + compaction_service(options.compaction_service) { + stats = statistics.get(); + fs = env->GetFileSystem(); + if (env != nullptr) { + clock = env->GetSystemClock().get(); + } else { + clock = SystemClock::Default().get(); + } + logger = info_log.get(); + stats = statistics.get(); } void ImmutableDBOptions::Dump(Logger* log) const { @@ -589,6 +614,12 @@ void ImmutableDBOptions::Dump(Logger* log) const { create_if_missing); ROCKS_LOG_HEADER(log, " Options.paranoid_checks: %d", paranoid_checks); + ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d", + flush_verify_memtable_count); + ROCKS_LOG_HEADER(log, + " " + "Options.track_and_verify_wals_in_manifest: %d", + track_and_verify_wals_in_manifest); ROCKS_LOG_HEADER(log, " Options.env: %p", env); ROCKS_LOG_HEADER(log, " Options.fs: %s", @@ -598,7 +629,7 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.max_file_opening_threads: %d", max_file_opening_threads); ROCKS_LOG_HEADER(log, " Options.statistics: %p", - statistics.get()); + stats); ROCKS_LOG_HEADER(log, " Options.use_fsync: %d", use_fsync); ROCKS_LOG_HEADER( @@ 
-638,10 +669,10 @@ void ImmutableDBOptions::Dump(Logger* log) const { table_cache_numshardbits); ROCKS_LOG_HEADER(log, " Options.WAL_ttl_seconds: %" PRIu64, - wal_ttl_seconds); + WAL_ttl_seconds); ROCKS_LOG_HEADER(log, " Options.WAL_size_limit_MB: %" PRIu64, - wal_size_limit_mb); + WAL_size_limit_MB); ROCKS_LOG_HEADER(log, " " "Options.max_write_batch_group_size_bytes: %" PRIu64, @@ -739,6 +770,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { allow_data_in_errors); ROCKS_LOG_HEADER(log, " Options.disable_manifest_sync: %d", disable_manifest_sync); + ROCKS_LOG_HEADER(log, " Options.db_host_id: %s", + db_host_id.c_str()); } MutableDBOptions::MutableDBOptions() @@ -827,4 +860,27 @@ void MutableDBOptions::Dump(Logger* log) const { max_background_flushes); } +#ifndef ROCKSDB_LITE +Status GetMutableDBOptionsFromStrings( + const MutableDBOptions& base_options, + const std::unordered_map& options_map, + MutableDBOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, db_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; +} + +Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, + const MutableDBOptions& mutable_opts, + std::string* opt_string) { + return OptionTypeInfo::SerializeType( + config_options, db_mutable_options_type_info, &mutable_opts, opt_string); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/db_options.h b/options/db_options.h index 61642e42611..cdd08e80a11 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -11,6 +11,7 @@ #include "rocksdb/options.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; struct ImmutableDBOptions { static const char* kName() { return "ImmutableDBOptions"; } @@ -23,8 +24,9 @@ struct ImmutableDBOptions { bool create_missing_column_families; bool error_if_exists; bool paranoid_checks; + bool flush_verify_memtable_count; + bool track_and_verify_wals_in_manifest; Env* env; - std::shared_ptr fs; std::shared_ptr rate_limiter; std::shared_ptr sst_file_manager; std::shared_ptr info_log; @@ -41,8 +43,8 @@ struct ImmutableDBOptions { size_t recycle_log_file_num; uint64_t max_manifest_file_size; int table_cache_numshardbits; - uint64_t wal_ttl_seconds; - uint64_t wal_size_limit_mb; + uint64_t WAL_ttl_seconds; + uint64_t WAL_size_limit_MB; uint64_t max_write_batch_group_size_bytes; size_t manifest_preallocation_size; bool allow_mmap_reads; @@ -93,6 +95,14 @@ struct ImmutableDBOptions { uint64_t bgerror_resume_retry_interval; bool allow_data_in_errors; bool disable_manifest_sync; + std::string db_host_id; + FileTypeSet checksum_handoff_file_types; + // Convenience/Helper objects that are not part of the base DBOptions + std::shared_ptr fs; + SystemClock* clock; + Statistics* stats; + Logger* logger; + std::shared_ptr compaction_service; }; struct MutableDBOptions { @@ -123,4 +133,15 @@ struct MutableDBOptions { int max_background_flushes; }; +#ifndef ROCKSDB_LITE +Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, + const MutableDBOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableDBOptionsFromStrings( + const MutableDBOptions& base_options, + const std::unordered_map& options_map, + MutableDBOptions* new_options); +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff --git a/options/options.cc b/options/options.cc index cf00059b71d..4faee64b4b1 100644 --- 
a/options/options.cc +++ b/options/options.cc @@ -92,7 +92,10 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) enable_blob_files(options.enable_blob_files), min_blob_size(options.min_blob_size), blob_file_size(options.blob_file_size), - blob_compression_type(options.blob_compression_type) { + blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -198,6 +201,11 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.bottommost_compression_opts.enabled: %s", bottommost_compression_opts.enabled ? "true" : "false"); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + bottommost_compression_opts.max_dict_buffer_bytes); ROCKS_LOG_HEADER(log, " Options.compression_opts.window_bits: %d", compression_opts.window_bits); ROCKS_LOG_HEADER(log, " Options.compression_opts.level: %d", @@ -219,6 +227,10 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.compression_opts.enabled: %s", compression_opts.enabled ? "true" : "false"); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + compression_opts.max_dict_buffer_bytes); ROCKS_LOG_HEADER(log, " Options.level0_file_num_compaction_trigger: %d", level0_file_num_compaction_trigger); ROCKS_LOG_HEADER(log, " Options.level0_slowdown_writes_trigger: %d", @@ -383,6 +395,10 @@ void ColumnFamilyOptions::Dump(Logger* log) const { blob_file_size); ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? 
"true" : "false"); + ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { diff --git a/options/options_helper.cc b/options/options_helper.cc index e4a53de8bd2..0aaa73b2543 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -28,6 +28,20 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +ConfigOptions::ConfigOptions() +#ifndef ROCKSDB_LITE + : registry(ObjectRegistry::NewInstance()) +#endif +{ + env = Env::Default(); +} + +ConfigOptions::ConfigOptions(const DBOptions& db_opts) : env(db_opts.env) { +#ifndef ROCKSDB_LITE + registry = ObjectRegistry::NewInstance(); +#endif +} + Status ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) { Status s; @@ -51,6 +65,10 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.create_missing_column_families; options.error_if_exists = immutable_db_options.error_if_exists; options.paranoid_checks = immutable_db_options.paranoid_checks; + options.flush_verify_memtable_count = + immutable_db_options.flush_verify_memtable_count; + options.track_and_verify_wals_in_manifest = + immutable_db_options.track_and_verify_wals_in_manifest; options.env = immutable_db_options.env; options.rate_limiter = immutable_db_options.rate_limiter; options.sst_file_manager = immutable_db_options.sst_file_manager; @@ -84,8 +102,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.max_manifest_file_size = immutable_db_options.max_manifest_file_size; options.table_cache_numshardbits = immutable_db_options.table_cache_numshardbits; - options.WAL_ttl_seconds = immutable_db_options.wal_ttl_seconds; - options.WAL_size_limit_MB = immutable_db_options.wal_size_limit_mb; + options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds; + options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB; options.manifest_preallocation_size = immutable_db_options.manifest_preallocation_size; options.allow_mmap_reads = immutable_db_options.allow_mmap_reads; @@ -166,6 +184,10 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.bgerror_resume_retry_interval = immutable_db_options.bgerror_resume_retry_interval; options.disable_manifest_sync = immutable_db_options.disable_manifest_sync; + options.db_host_id = immutable_db_options.db_host_id; + options.allow_data_in_errors = immutable_db_options.allow_data_in_errors; + options.checksum_handoff_file_types = + immutable_db_options.checksum_handoff_file_types; return options; } @@ -173,81 +195,115 @@ ColumnFamilyOptions BuildColumnFamilyOptions( const ColumnFamilyOptions& options, const MutableCFOptions& mutable_cf_options) { ColumnFamilyOptions cf_opts(options); + UpdateColumnFamilyOptions(mutable_cf_options, &cf_opts); + // TODO(yhchiang): find some way to handle the following derived options + // * max_file_size + return cf_opts; +} +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts) { // Memtable related options - cf_opts.write_buffer_size = mutable_cf_options.write_buffer_size; - cf_opts.max_write_buffer_number = mutable_cf_options.max_write_buffer_number; - cf_opts.arena_block_size = mutable_cf_options.arena_block_size; - cf_opts.memtable_prefix_bloom_size_ratio = - mutable_cf_options.memtable_prefix_bloom_size_ratio; - cf_opts.memtable_whole_key_filtering = - 
mutable_cf_options.memtable_whole_key_filtering; - cf_opts.memtable_huge_page_size = mutable_cf_options.memtable_huge_page_size; - cf_opts.max_successive_merges = mutable_cf_options.max_successive_merges; - cf_opts.inplace_update_num_locks = - mutable_cf_options.inplace_update_num_locks; - cf_opts.prefix_extractor = mutable_cf_options.prefix_extractor; + cf_opts->write_buffer_size = moptions.write_buffer_size; + cf_opts->max_write_buffer_number = moptions.max_write_buffer_number; + cf_opts->arena_block_size = moptions.arena_block_size; + cf_opts->memtable_prefix_bloom_size_ratio = + moptions.memtable_prefix_bloom_size_ratio; + cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering; + cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size; + cf_opts->max_successive_merges = moptions.max_successive_merges; + cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks; + cf_opts->prefix_extractor = moptions.prefix_extractor; // Compaction related options - cf_opts.disable_auto_compactions = - mutable_cf_options.disable_auto_compactions; - cf_opts.soft_pending_compaction_bytes_limit = - mutable_cf_options.soft_pending_compaction_bytes_limit; - cf_opts.hard_pending_compaction_bytes_limit = - mutable_cf_options.hard_pending_compaction_bytes_limit; - cf_opts.level0_file_num_compaction_trigger = - mutable_cf_options.level0_file_num_compaction_trigger; - cf_opts.level0_slowdown_writes_trigger = - mutable_cf_options.level0_slowdown_writes_trigger; - cf_opts.level0_stop_writes_trigger = - mutable_cf_options.level0_stop_writes_trigger; - cf_opts.max_compaction_bytes = mutable_cf_options.max_compaction_bytes; - cf_opts.target_file_size_base = mutable_cf_options.target_file_size_base; - cf_opts.target_file_size_multiplier = - mutable_cf_options.target_file_size_multiplier; - cf_opts.max_bytes_for_level_base = - mutable_cf_options.max_bytes_for_level_base; - cf_opts.max_bytes_for_level_multiplier = - mutable_cf_options.max_bytes_for_level_multiplier; - cf_opts.ttl = mutable_cf_options.ttl; - cf_opts.periodic_compaction_seconds = - mutable_cf_options.periodic_compaction_seconds; - - cf_opts.max_bytes_for_level_multiplier_additional.clear(); - for (auto value : - mutable_cf_options.max_bytes_for_level_multiplier_additional) { - cf_opts.max_bytes_for_level_multiplier_additional.emplace_back(value); + cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; + cf_opts->soft_pending_compaction_bytes_limit = + moptions.soft_pending_compaction_bytes_limit; + cf_opts->hard_pending_compaction_bytes_limit = + moptions.hard_pending_compaction_bytes_limit; + cf_opts->level0_file_num_compaction_trigger = + moptions.level0_file_num_compaction_trigger; + cf_opts->level0_slowdown_writes_trigger = + moptions.level0_slowdown_writes_trigger; + cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger; + cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; + cf_opts->target_file_size_base = moptions.target_file_size_base; + cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; + cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; + cf_opts->max_bytes_for_level_multiplier = + moptions.max_bytes_for_level_multiplier; + cf_opts->ttl = moptions.ttl; + cf_opts->periodic_compaction_seconds = moptions.periodic_compaction_seconds; + + cf_opts->max_bytes_for_level_multiplier_additional.clear(); + for (auto value : moptions.max_bytes_for_level_multiplier_additional) { + 
cf_opts->max_bytes_for_level_multiplier_additional.emplace_back(value); } - cf_opts.compaction_options_fifo = mutable_cf_options.compaction_options_fifo; - cf_opts.compaction_options_universal = - mutable_cf_options.compaction_options_universal; + cf_opts->compaction_options_fifo = moptions.compaction_options_fifo; + cf_opts->compaction_options_universal = moptions.compaction_options_universal; // Blob file related options - cf_opts.enable_blob_files = mutable_cf_options.enable_blob_files; - cf_opts.min_blob_size = mutable_cf_options.min_blob_size; - cf_opts.blob_file_size = mutable_cf_options.blob_file_size; - cf_opts.blob_compression_type = mutable_cf_options.blob_compression_type; + cf_opts->enable_blob_files = moptions.enable_blob_files; + cf_opts->min_blob_size = moptions.min_blob_size; + cf_opts->blob_file_size = moptions.blob_file_size; + cf_opts->blob_compression_type = moptions.blob_compression_type; + cf_opts->enable_blob_garbage_collection = + moptions.enable_blob_garbage_collection; + cf_opts->blob_garbage_collection_age_cutoff = + moptions.blob_garbage_collection_age_cutoff; // Misc options - cf_opts.max_sequential_skip_in_iterations = - mutable_cf_options.max_sequential_skip_in_iterations; - cf_opts.check_flush_compaction_key_order = - mutable_cf_options.check_flush_compaction_key_order; - cf_opts.paranoid_file_checks = mutable_cf_options.paranoid_file_checks; - cf_opts.report_bg_io_stats = mutable_cf_options.report_bg_io_stats; - cf_opts.compression = mutable_cf_options.compression; - cf_opts.compression_opts = mutable_cf_options.compression_opts; - cf_opts.bottommost_compression = mutable_cf_options.bottommost_compression; - cf_opts.bottommost_compression_opts = - mutable_cf_options.bottommost_compression_opts; - cf_opts.sample_for_compression = mutable_cf_options.sample_for_compression; - - cf_opts.table_factory = options.table_factory; + cf_opts->max_sequential_skip_in_iterations = + moptions.max_sequential_skip_in_iterations; + cf_opts->check_flush_compaction_key_order = + moptions.check_flush_compaction_key_order; + cf_opts->paranoid_file_checks = moptions.paranoid_file_checks; + cf_opts->report_bg_io_stats = moptions.report_bg_io_stats; + cf_opts->compression = moptions.compression; + cf_opts->compression_opts = moptions.compression_opts; + cf_opts->bottommost_compression = moptions.bottommost_compression; + cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts; + cf_opts->sample_for_compression = moptions.sample_for_compression; +} + +void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, + ColumnFamilyOptions* cf_opts) { + cf_opts->compaction_style = ioptions.compaction_style; + cf_opts->compaction_pri = ioptions.compaction_pri; + cf_opts->comparator = ioptions.user_comparator; + cf_opts->merge_operator = ioptions.merge_operator; + cf_opts->compaction_filter = ioptions.compaction_filter; + cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory; + cf_opts->min_write_buffer_number_to_merge = + ioptions.min_write_buffer_number_to_merge; + cf_opts->max_write_buffer_number_to_maintain = + ioptions.max_write_buffer_number_to_maintain; + cf_opts->max_write_buffer_size_to_maintain = + ioptions.max_write_buffer_size_to_maintain; + cf_opts->inplace_update_support = ioptions.inplace_update_support; + cf_opts->inplace_callback = ioptions.inplace_callback; + cf_opts->memtable_factory = ioptions.memtable_factory; + cf_opts->table_factory = ioptions.table_factory; + cf_opts->table_properties_collector_factories = + 
ioptions.table_properties_collector_factories; + cf_opts->bloom_locality = ioptions.bloom_locality; + cf_opts->purge_redundant_kvs_while_flush = + ioptions.purge_redundant_kvs_while_flush; + cf_opts->compression_per_level = ioptions.compression_per_level; + cf_opts->level_compaction_dynamic_level_bytes = + ioptions.level_compaction_dynamic_level_bytes; + cf_opts->num_levels = ioptions.num_levels; + cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits; + cf_opts->force_consistency_checks = ioptions.force_consistency_checks; + cf_opts->memtable_insert_with_hint_prefix_extractor = + ioptions.memtable_insert_with_hint_prefix_extractor; + cf_opts->cf_paths = ioptions.cf_paths; + cf_opts->compaction_thread_limiter = ioptions.compaction_thread_limiter; + cf_opts->sst_partitioner_factory = ioptions.sst_partitioner_factory; + // TODO(yhchiang): find some way to handle the following derived options // * max_file_size - - return cf_opts; } std::map @@ -298,6 +354,17 @@ std::vector GetSupportedCompressions() { return supported_compressions; } +std::vector GetSupportedDictCompressions() { + std::vector dict_compression_types; + for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) { + CompressionType t = comp_to_name.second; + if (t != kDisableCompressionOption && DictCompressionTypeSupported(t)) { + dict_compression_types.push_back(t); + } + } + return dict_compression_types; +} + #ifndef ROCKSDB_LITE bool ParseSliceTransformHelper( const std::string& kFixedPrefixName, const std::string& kCappedPrefixName, @@ -354,137 +421,144 @@ bool ParseSliceTransform( return false; } -static bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, +static bool ParseOptionHelper(void* opt_address, const OptionType& opt_type, const std::string& value) { switch (opt_type) { case OptionType::kBoolean: - *reinterpret_cast(opt_address) = ParseBoolean("", value); + *static_cast(opt_address) = ParseBoolean("", value); break; case OptionType::kInt: - *reinterpret_cast(opt_address) = ParseInt(value); + *static_cast(opt_address) = ParseInt(value); break; case OptionType::kInt32T: - *reinterpret_cast(opt_address) = ParseInt32(value); + *static_cast(opt_address) = ParseInt32(value); break; case OptionType::kInt64T: - PutUnaligned(reinterpret_cast(opt_address), ParseInt64(value)); + PutUnaligned(static_cast(opt_address), ParseInt64(value)); break; case OptionType::kUInt: - *reinterpret_cast(opt_address) = ParseUint32(value); + *static_cast(opt_address) = ParseUint32(value); + break; + case OptionType::kUInt8T: + *static_cast(opt_address) = ParseUint8(value); break; case OptionType::kUInt32T: - *reinterpret_cast(opt_address) = ParseUint32(value); + *static_cast(opt_address) = ParseUint32(value); break; case OptionType::kUInt64T: - PutUnaligned(reinterpret_cast(opt_address), ParseUint64(value)); + PutUnaligned(static_cast(opt_address), ParseUint64(value)); break; case OptionType::kSizeT: - PutUnaligned(reinterpret_cast(opt_address), ParseSizeT(value)); + PutUnaligned(static_cast(opt_address), ParseSizeT(value)); break; case OptionType::kString: - *reinterpret_cast(opt_address) = value; + *static_cast(opt_address) = value; break; case OptionType::kDouble: - *reinterpret_cast(opt_address) = ParseDouble(value); + *static_cast(opt_address) = ParseDouble(value); break; case OptionType::kCompactionStyle: return ParseEnum( compaction_style_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); case OptionType::kCompactionPri: - return ParseEnum( - 
compaction_pri_string_map, value, - reinterpret_cast(opt_address)); + return ParseEnum(compaction_pri_string_map, value, + static_cast(opt_address)); case OptionType::kCompressionType: return ParseEnum( compression_type_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); case OptionType::kSliceTransform: return ParseSliceTransform( - value, reinterpret_cast*>( - opt_address)); + value, + static_cast*>(opt_address)); case OptionType::kChecksumType: - return ParseEnum( - checksum_type_string_map, value, - reinterpret_cast(opt_address)); + return ParseEnum(checksum_type_string_map, value, + static_cast(opt_address)); case OptionType::kEncodingType: - return ParseEnum( - encoding_type_string_map, value, - reinterpret_cast(opt_address)); + return ParseEnum(encoding_type_string_map, value, + static_cast(opt_address)); case OptionType::kCompactionStopStyle: return ParseEnum( compaction_stop_style_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); + case OptionType::kEncodedString: { + std::string* output_addr = static_cast(opt_address); + (Slice(value)).DecodeHex(output_addr); + break; + } default: return false; } return true; } -bool SerializeSingleOptionHelper(const char* opt_address, +bool SerializeSingleOptionHelper(const void* opt_address, const OptionType opt_type, std::string* value) { - assert(value); switch (opt_type) { case OptionType::kBoolean: - *value = *(reinterpret_cast(opt_address)) ? "true" : "false"; + *value = *(static_cast(opt_address)) ? "true" : "false"; break; case OptionType::kInt: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kInt32T: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kInt64T: { int64_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kUInt: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); + break; + case OptionType::kUInt8T: + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kUInt32T: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kUInt64T: { uint64_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kSizeT: { size_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kDouble: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kString: - *value = EscapeOptionString( - *(reinterpret_cast(opt_address))); + *value = + EscapeOptionString(*(static_cast(opt_address))); break; case OptionType::kCompactionStyle: return SerializeEnum( compaction_style_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kCompactionPri: return SerializeEnum( compaction_pri_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kCompressionType: return SerializeEnum( compression_type_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kSliceTransform: { const auto* 
slice_transform_ptr = - reinterpret_cast*>( + static_cast*>( opt_address); *value = slice_transform_ptr->get() ? slice_transform_ptr->get()->Name() : kNullptrString; @@ -492,7 +566,7 @@ bool SerializeSingleOptionHelper(const char* opt_address, } case OptionType::kComparator: { // it's a const pointer of const Comparator* - const auto* ptr = reinterpret_cast(opt_address); + const auto* ptr = static_cast(opt_address); // Since the user-specified comparator will be wrapped by // InternalKeyComparator, we should persist the user-specified one // instead of InternalKeyComparator. @@ -510,43 +584,42 @@ bool SerializeSingleOptionHelper(const char* opt_address, case OptionType::kCompactionFilter: { // it's a const pointer of const CompactionFilter* const auto* ptr = - reinterpret_cast(opt_address); + static_cast(opt_address); *value = *ptr ? (*ptr)->Name() : kNullptrString; break; } case OptionType::kCompactionFilterFactory: { const auto* ptr = - reinterpret_cast*>( + static_cast*>( opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kMemTableRepFactory: { const auto* ptr = - reinterpret_cast*>( - opt_address); + static_cast*>(opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kMergeOperator: { const auto* ptr = - reinterpret_cast*>(opt_address); + static_cast*>(opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kFilterPolicy: { const auto* ptr = - reinterpret_cast*>(opt_address); + static_cast*>(opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kChecksumType: return SerializeEnum( checksum_type_string_map, - *reinterpret_cast(opt_address), value); + *static_cast(opt_address), value); case OptionType::kFlushBlockPolicyFactory: { const auto* ptr = - reinterpret_cast*>( + static_cast*>( opt_address); *value = ptr->get() ? 
ptr->get()->Name() : kNullptrString; break; @@ -554,11 +627,16 @@ bool SerializeSingleOptionHelper(const char* opt_address, case OptionType::kEncodingType: return SerializeEnum( encoding_type_string_map, - *reinterpret_cast(opt_address), value); + *static_cast(opt_address), value); case OptionType::kCompactionStopStyle: return SerializeEnum( compaction_stop_style_string_map, - *reinterpret_cast(opt_address), value); + *static_cast(opt_address), value); + case OptionType::kEncodedString: { + const auto* ptr = static_cast(opt_address); + *value = (Slice(*ptr)).ToString(true); + break; + } default: return false; } @@ -577,32 +655,6 @@ Status ConfigureFromMap( return s; } -Status GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - Logger* /*info_log*/, MutableCFOptions* new_options) { - assert(new_options); - *new_options = base_options; - ConfigOptions config_options; - const auto config = CFOptionsAsConfigurable(base_options); - return ConfigureFromMap(config_options, options_map, - MutableCFOptions::kName(), - config.get(), new_options); -} - -Status GetMutableDBOptionsFromStrings( - const MutableDBOptions& base_options, - const std::unordered_map& options_map, - MutableDBOptions* new_options) { - assert(new_options); - *new_options = base_options; - ConfigOptions config_options; - - auto config = DBOptionsAsConfigurable(base_options); - return ConfigureFromMap(config_options, options_map, - MutableDBOptions::kName(), - config.get(), new_options); -} Status StringToMap(const std::string& opts_str, std::unordered_map* opts_map) { @@ -644,17 +696,11 @@ Status StringToMap(const std::string& opts_str, return Status::OK(); } -Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, - const MutableDBOptions& mutable_opts, - std::string* opt_string) { - auto config = DBOptionsAsConfigurable(mutable_opts); - return config->GetOptionString(config_options, opt_string); -} Status GetStringFromDBOptions(std::string* opt_string, const DBOptions& db_options, const std::string& delimiter) { - ConfigOptions config_options; + ConfigOptions config_options(db_options); config_options.delimiter = delimiter; return GetStringFromDBOptions(config_options, db_options, opt_string); } @@ -668,14 +714,6 @@ Status GetStringFromDBOptions(const ConfigOptions& config_options, return config->GetOptionString(config_options, opt_string); } -Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, - const MutableCFOptions& mutable_opts, - std::string* opt_string) { - assert(opt_string); - opt_string->clear(); - const auto config = CFOptionsAsConfigurable(mutable_opts); - return config->GetOptionString(config_options, opt_string); -} Status GetStringFromColumnFamilyOptions(std::string* opt_string, const ColumnFamilyOptions& cf_options, @@ -767,7 +805,7 @@ Status GetDBOptionsFromMap( const std::unordered_map& opts_map, DBOptions* new_options, bool input_strings_escaped, bool ignore_unknown_options) { - ConfigOptions config_options; + ConfigOptions config_options(base_options); config_options.input_strings_escaped = input_strings_escaped; config_options.ignore_unknown_options = ignore_unknown_options; return GetDBOptionsFromMap(config_options, base_options, opts_map, @@ -795,7 +833,7 @@ Status GetDBOptionsFromMap( Status GetDBOptionsFromString(const DBOptions& base_options, const std::string& opts_str, DBOptions* new_options) { - ConfigOptions config_options; + ConfigOptions config_options(base_options); 
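Editor's note on the `ConfigOptions config_options(base_options);` changes in this file: the new ConfigOptions(const DBOptions&) constructor seeds the parser with the Env (and, in non-LITE builds, an ObjectRegistry) taken from the caller's base options, so values such as `env=` can be resolved through Env::CreateFromString with the right context. A hedged usage sketch of the public entry point follows; the option string and field values are arbitrary examples.

```cpp
// Hedged usage sketch (not part of the patch): GetDBOptionsFromString()
// starts from base_options and applies the parsed key=value pairs, now using
// a ConfigOptions seeded from base_options internally.
#include <cassert>
#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"

void Example() {
  ROCKSDB_NAMESPACE::DBOptions base;
  base.env = ROCKSDB_NAMESPACE::Env::Default();  // carried into the parser

  ROCKSDB_NAMESPACE::DBOptions parsed;
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::GetDBOptionsFromString(
      base, "max_open_files=64;paranoid_checks=true", &parsed);
  assert(s.ok());
  assert(parsed.max_open_files == 64);
  assert(parsed.paranoid_checks);
}
```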
config_options.input_strings_escaped = false; config_options.ignore_unknown_options = false; @@ -819,7 +857,7 @@ Status GetDBOptionsFromString(const ConfigOptions& config_options, Status GetOptionsFromString(const Options& base_options, const std::string& opts_str, Options* new_options) { - ConfigOptions config_options; + ConfigOptions config_options(base_options); config_options.input_strings_escaped = false; config_options.ignore_unknown_options = false; @@ -834,6 +872,7 @@ Status GetOptionsFromString(const ConfigOptions& config_options, std::unordered_map unused_opts; std::unordered_map opts_map; + assert(new_options); *new_options = base_options; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { @@ -946,7 +985,7 @@ Status OptionTypeInfo::Parse(const ConfigOptions& config_options, return Status::OK(); } try { - char* opt_addr = reinterpret_cast(opt_ptr) + offset_; + void* opt_addr = static_cast(opt_ptr) + offset_; const std::string& opt_value = config_options.input_strings_escaped ? UnescapeOptionString(value) : value; @@ -988,28 +1027,56 @@ Status OptionTypeInfo::Parse(const ConfigOptions& config_options, } } +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + std::unordered_map opts_map; + Status status = StringToMap(opts_str, &opts_map); + if (!status.ok()) { + return status; + } else { + return ParseType(config_options, opts_map, type_map, opt_addr, unused); + } +} + +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + for (const auto& opts_iter : opts_map) { + std::string opt_name; + const auto* opt_info = Find(opts_iter.first, type_map, &opt_name); + if (opt_info != nullptr) { + Status status = + opt_info->Parse(config_options, opt_name, opts_iter.second, opt_addr); + if (!status.ok()) { + return status; + } + } else if (unused != nullptr) { + (*unused)[opts_iter.first] = opts_iter.second; + } else if (!config_options.ignore_unknown_options) { + return Status::NotFound("Unrecognized option", opts_iter.first); + } + } + return Status::OK(); +} + Status OptionTypeInfo::ParseStruct( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* struct_map, - const std::string& opt_name, const std::string& opt_value, char* opt_addr) { + const std::string& opt_name, const std::string& opt_value, void* opt_addr) { assert(struct_map); Status status; if (opt_name == struct_name || EndsWith(opt_name, "." + struct_name)) { // This option represents the entire struct - std::unordered_map opt_map; - status = StringToMap(opt_value, &opt_map); - for (const auto& map_iter : opt_map) { - if (!status.ok()) { - break; - } - const auto iter = struct_map->find(map_iter.first); - if (iter != struct_map->end()) { - status = iter->second.Parse(config_options, map_iter.first, - map_iter.second, opt_addr); - } else { - status = Status::InvalidArgument("Unrecognized option", - struct_name + "." + map_iter.first); - } + std::unordered_map unused; + status = + ParseType(config_options, opt_value, *struct_map, opt_addr, &unused); + if (status.ok() && !unused.empty()) { + status = Status::InvalidArgument( + "Unrecognized option", struct_name + "." 
+ unused.begin()->first); } } else if (StartsWith(opt_name, struct_name + ".")) { // This option represents a nested field in the struct (e.g, struct.field) @@ -1041,7 +1108,7 @@ Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, std::string* opt_value) const { // If the option is no longer used in rocksdb and marked as deprecated, // we skip it in the serialization. - const char* opt_addr = reinterpret_cast(opt_ptr) + offset_; + const void* opt_addr = static_cast(opt_ptr) + offset_; if (opt_addr == nullptr || IsDeprecated()) { return Status::OK(); } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) { @@ -1050,6 +1117,19 @@ Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, return serialize_func_(config_options, opt_name, opt_addr, opt_value); } else if (SerializeSingleOptionHelper(opt_addr, type_, opt_value)) { return Status::OK(); + } else if (IsCustomizable()) { + const Customizable* custom = AsRawPointer(opt_ptr); + if (custom == nullptr) { + *opt_value = kNullptrString; + } else if (IsEnabled(OptionTypeFlags::kStringNameOnly) && + !config_options.IsDetailed()) { + *opt_value = custom->GetId(); + } else { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + *opt_value = custom->ToString(embedded); + } + return Status::OK(); } else if (IsConfigurable()) { const Configurable* config = AsRawPointer(opt_ptr); if (config != nullptr) { @@ -1063,10 +1143,31 @@ Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, } } +Status OptionTypeInfo::SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* result) { + Status status; + for (const auto& iter : type_map) { + std::string single; + const auto& opt_info = iter.second; + if (opt_info.ShouldSerialize()) { + status = + opt_info.Serialize(config_options, iter.first, opt_addr, &single); + if (!status.ok()) { + return status; + } else { + result->append(iter.first + "=" + single + config_options.delimiter); + } + } + } + return status; +} + Status OptionTypeInfo::SerializeStruct( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* struct_map, - const std::string& opt_name, const char* opt_addr, std::string* value) { + const std::string& opt_name, const void* opt_addr, std::string* value) { assert(struct_map); Status status; if (EndsWith(opt_name, struct_name)) { @@ -1077,19 +1178,12 @@ Status OptionTypeInfo::SerializeStruct( // This option represents the entire struct std::string result; - for (const auto& iter : *struct_map) { - std::string single; - const auto& opt_info = iter.second; - if (opt_info.ShouldSerialize()) { - status = opt_info.Serialize(embedded, iter.first, opt_addr, &single); - if (!status.ok()) { - return status; - } else { - result.append(iter.first + "=" + single + embedded.delimiter); - } - } + status = SerializeType(embedded, *struct_map, opt_addr, &result); + if (!status.ok()) { + return status; + } else { + *value = "{" + result + "}"; } - *value = "{" + result + "}"; } else if (StartsWith(opt_name, struct_name + ".")) { // This option represents a nested field in the struct (e.g, struct.field) std::string elem_name; @@ -1115,17 +1209,16 @@ Status OptionTypeInfo::SerializeStruct( } template -bool IsOptionEqual(const char* offset1, const char* offset2) { - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); +bool IsOptionEqual(const void* offset1, const void* offset2) { + return (*static_cast(offset1) == 
*static_cast(offset2)); } static bool AreEqualDoubles(const double a, const double b) { return (fabs(a - b) < 0.00001); } -static bool AreOptionsEqual(OptionType type, const char* this_offset, - const char* that_offset) { +static bool AreOptionsEqual(OptionType type, const void* this_offset, + const void* that_offset) { switch (type) { case OptionType::kBoolean: return IsOptionEqual(this_offset, that_offset); @@ -1137,29 +1230,31 @@ static bool AreOptionsEqual(OptionType type, const char* this_offset, return IsOptionEqual(this_offset, that_offset); case OptionType::kInt64T: { int64_t v1, v2; - GetUnaligned(reinterpret_cast(this_offset), &v1); - GetUnaligned(reinterpret_cast(that_offset), &v2); + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); return (v1 == v2); } + case OptionType::kUInt8T: + return IsOptionEqual(this_offset, that_offset); case OptionType::kUInt32T: return IsOptionEqual(this_offset, that_offset); case OptionType::kUInt64T: { uint64_t v1, v2; - GetUnaligned(reinterpret_cast(this_offset), &v1); - GetUnaligned(reinterpret_cast(that_offset), &v2); + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); return (v1 == v2); } case OptionType::kSizeT: { size_t v1, v2; - GetUnaligned(reinterpret_cast(this_offset), &v1); - GetUnaligned(reinterpret_cast(that_offset), &v2); + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); return (v1 == v2); } case OptionType::kString: return IsOptionEqual(this_offset, that_offset); case OptionType::kDouble: - return AreEqualDoubles(*reinterpret_cast(this_offset), - *reinterpret_cast(that_offset)); + return AreEqualDoubles(*static_cast(this_offset), + *static_cast(that_offset)); case OptionType::kCompactionStyle: return IsOptionEqual(this_offset, that_offset); case OptionType::kCompactionStopStyle: @@ -1172,6 +1267,8 @@ static bool AreOptionsEqual(OptionType type, const char* this_offset, return IsOptionEqual(this_offset, that_offset); case OptionType::kEncodingType: return IsOptionEqual(this_offset, that_offset); + case OptionType::kEncodedString: + return IsOptionEqual(this_offset, that_offset); default: return false; } // End switch @@ -1186,8 +1283,8 @@ bool OptionTypeInfo::AreEqual(const ConfigOptions& config_options, if (!config_options.IsCheckEnabled(level)) { return true; // If the sanity level is not being checked, skip it } - const auto this_addr = reinterpret_cast(this_ptr) + offset_; - const auto that_addr = reinterpret_cast(that_ptr) + offset_; + const void* this_addr = static_cast(this_ptr) + offset_; + const void* that_addr = static_cast(that_ptr) + offset_; if (this_addr == nullptr || that_addr == nullptr) { if (this_addr == that_addr) { return true; @@ -1227,25 +1324,35 @@ bool OptionTypeInfo::AreEqual(const ConfigOptions& config_options, return false; } +bool OptionTypeInfo::TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* this_addr, const void* that_addr, std::string* mismatch) { + for (const auto& iter : type_map) { + const auto& opt_info = iter.second; + if (!opt_info.AreEqual(config_options, iter.first, this_addr, that_addr, + mismatch)) { + return false; + } + } + return true; +} + bool OptionTypeInfo::StructsAreEqual( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* struct_map, - const std::string& opt_name, const char* this_addr, const char* that_addr, + const std::string& opt_name, const void* 
this_addr, const void* that_addr, std::string* mismatch) { assert(struct_map); bool matches = true; std::string result; if (EndsWith(opt_name, struct_name)) { // This option represents the entire struct - for (const auto& iter : *struct_map) { - const auto& opt_info = iter.second; - - matches = opt_info.AreEqual(config_options, iter.first, this_addr, - that_addr, &result); - if (!matches) { - *mismatch = struct_name + "." + result; - return false; - } + matches = TypesAreEqual(config_options, *struct_map, this_addr, that_addr, + &result); + if (!matches) { + *mismatch = struct_name + "." + result; + return false; } } else if (StartsWith(opt_name, struct_name + ".")) { // This option represents a nested field in the struct (e.g, struct.field) diff --git a/options/options_helper.h b/options/options_helper.h index 4323d5f8e50..a16c265ede9 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -18,6 +18,7 @@ namespace ROCKSDB_NAMESPACE { struct ColumnFamilyOptions; struct ConfigOptions; struct DBOptions; +struct ImmutableCFOptions; struct ImmutableDBOptions; struct MutableDBOptions; struct MutableCFOptions; @@ -25,6 +26,8 @@ struct Options; std::vector GetSupportedCompressions(); +std::vector GetSupportedDictCompressions(); + // Checks that the combination of DBOptions and ColumnFamilyOptions are valid Status ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts); @@ -36,6 +39,11 @@ ColumnFamilyOptions BuildColumnFamilyOptions( const ColumnFamilyOptions& ioptions, const MutableCFOptions& mutable_cf_options); +void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, + ColumnFamilyOptions* cf_opts); +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts); + #ifndef ROCKSDB_LITE std::unique_ptr DBOptionsAsConfigurable( const MutableDBOptions& opts); @@ -46,23 +54,6 @@ std::unique_ptr CFOptionsAsConfigurable( const ColumnFamilyOptions& opts, const std::unordered_map* opt_map = nullptr); -Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, - const MutableCFOptions& mutable_opts, - std::string* opt_string); - -Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, - const MutableDBOptions& mutable_opts, - std::string* opt_string); - -Status GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - Logger* info_log, MutableCFOptions* new_options); - -Status GetMutableDBOptionsFromStrings( - const MutableDBOptions& base_options, - const std::unordered_map& options_map, - MutableDBOptions* new_options); bool ParseSliceTransform( const std::string& value, diff --git a/options/options_parser.cc b/options/options_parser.cc index e5f6106ab05..42cde218aad 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -13,7 +13,7 @@ #include #include -#include "file/read_write_util.h" +#include "file/line_file_reader.h" #include "file/writable_file_writer.h" #include "options/cf_options.h" #include "options/db_options.h" @@ -262,22 +262,17 @@ Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in, if (!s.ok()) { return s; } - SequentialFileReader sf_reader(std::move(seq_file), file_name, - config_options.file_readahead_size); + LineFileReader lf_reader(std::move(seq_file), file_name, + config_options.file_readahead_size); OptionSection section = kOptionSectionUnknown; std::string title; std::string argument; std::unordered_map opt_map; - std::istringstream iss; std::string line; - bool has_data 
= true; // we only support single-lined statement. - for (int line_num = 1; ReadOneLine(&iss, &sf_reader, &line, &has_data, &s); - ++line_num) { - if (!s.ok()) { - return s; - } + while (lf_reader.ReadLine(&line)) { + int line_num = static_cast(lf_reader.GetLineNumber()); line = TrimAndRemoveComment(line); if (line.empty()) { continue; @@ -313,6 +308,10 @@ Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in, opt_map.insert({name, value}); } } + s = lf_reader.GetStatus(); + if (!s.ok()) { + return s; + } s = EndSection(config_options, section, title, argument, opt_map); opt_map.clear(); diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 70bbb5123c3..e5cdce36dca 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -158,6 +158,9 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { *bbto, "cache_index_and_filter_blocks=1;" "cache_index_and_filter_blocks_with_high_priority=true;" + "metadata_cache_options={top_level_index_pinning=kFallback;" + "partition_pinning=kAll;" + "unpartitioned_pinning=kFlushedAndSimilar;};" "pin_l0_filter_and_index_blocks_in_cache=1;" "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" @@ -176,7 +179,9 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "hash_index_allow_collision=false;" "verify_compression=true;read_amp_bytes_per_bit=0;" "enable_index_compression=false;" - "block_align=true", + "block_align=true;" + "max_auto_readahead_size=0;" + "prepopulate_block_cache=kDisable", new_bbto)); ASSERT_EQ(unset_bytes_base, @@ -223,6 +228,11 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { {offsetof(struct DBOptions, wal_filter), sizeof(const WalFilter*)}, {offsetof(struct DBOptions, file_checksum_gen_factory), sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, db_host_id), sizeof(std::string)}, + {offsetof(struct DBOptions, checksum_handoff_file_types), + sizeof(FileTypeSet)}, + {offsetof(struct DBOptions, compaction_service), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(DBOptions)]; @@ -278,6 +288,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "skip_log_error_on_recovery=true;" "writable_file_max_buffer_size=1048576;" "paranoid_checks=true;" + "flush_verify_memtable_count=true;" + "track_and_verify_wals_in_manifest=true;" "is_fd_close_on_exec=false;" "bytes_per_sync=4295013613;" "strict_bytes_per_sync=true;" @@ -330,7 +342,9 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "write_dbid_to_manifest=false;" "best_efforts_recovery=false;" "max_bgerror_resume_count=2;" - "bgerror_resume_retry_interval=1000000", + "bgerror_resume_retry_interval=1000000" + "db_host_id=hostname;" + "allow_data_in_errors=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), @@ -400,14 +414,14 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { // Count padding bytes by setting all bytes in the memory to a special char, // copy a well constructed struct to this memory and see how many special // bytes left. - ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions), kColumnFamilyOptionsExcluded); - // It based on the behavior of compiler that padding bytes are not changed - // when copying the struct. It's prone to failure when compiler behavior - // changes. We verify there is unset bytes to detect the case. 
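Editor's note: the comments above describe the sentinel-byte technique this settable-options test relies on, which is to fill the raw storage with a marker byte, construct the options object in place, and then count bytes that still hold the marker to find fields the string-based setters never touched. A minimal standalone illustration follows, under the assumption that the compiler leaves padding bytes alone (which, as the test's own comments note, is not guaranteed); the names and the sentinel value are illustrative.

```cpp
// Minimal illustration of the sentinel-byte technique; not part of the patch.
#include <cstring>
#include <iostream>
#include <new>

struct Probe {
  int a = 0;
  bool b = false;
  // Padding after `b` is left to the compiler.
  long c = 0;
};

int main() {
  constexpr unsigned char kSpecialChar = 0xFE;  // illustrative sentinel
  alignas(Probe) unsigned char buf[sizeof(Probe)];
  std::memset(buf, kSpecialChar, sizeof(buf));

  // Placement-new runs the constructor directly in the buffer instead of
  // copy-assigning a fully initialized object over it, so padding bytes
  // typically keep the sentinel (not guaranteed by the standard).
  Probe* p = new (buf) Probe();

  size_t untouched = 0;
  for (size_t i = 0; i < sizeof(Probe); ++i) {
    if (buf[i] == kSpecialChar) {
      ++untouched;
    }
  }
  std::cout << "bytes still holding the sentinel: " << untouched << "\n";
  p->~Probe();
  return 0;
}
```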
- *options = ColumnFamilyOptions(); + // Invoke a user-defined constructor in the hope that it does not overwrite + // padding bytes. Note that previously we relied on the implicitly-defined + // copy-assignment operator (i.e., `*options = ColumnFamilyOptions();`) here, + // which did in fact modify padding bytes. + ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); // Deprecatd option which is not initialized. Need to set it to avoid // Valgrind error @@ -433,6 +447,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { options->max_mem_compaction_level = 0; options->compaction_filter = nullptr; options->sst_partitioner_factory = nullptr; + options->bottommost_temperature = Temperature::kUnknown; char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)]; ColumnFamilyOptions* new_options = @@ -465,8 +480,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "max_bytes_for_level_multiplier=60;" "memtable_factory=SkipListFactory;" "compression=kNoCompression;" - "compression_opts=5:6:7:8:9:true;" - "bottommost_compression_opts=4:5:6:7:8:true;" + "compression_opts=5:6:7:8:9:10:true:11;" + "bottommost_compression_opts=4:5:6:7:8:9:true:10;" "bottommost_compression=kDisableCompressionOption;" "level0_stop_writes_trigger=33;" "num_levels=99;" @@ -500,6 +515,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "min_blob_size=256;" "blob_file_size=1000000;" "blob_compression_type=kBZip2Compression;" + "enable_blob_garbage_collection=true;" + "blob_garbage_collection_age_cutoff=0.5;" "compaction_options_fifo={max_table_files_size=3;allow_" "compaction=false;};", new_options)); diff --git a/options/options_test.cc b/options/options_test.cc index 33925d7ed44..93f74c4c62e 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -102,6 +102,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"min_blob_size", "1K"}, {"blob_file_size", "1G"}, {"blob_compression_type", "kZSTD"}, + {"enable_blob_garbage_collection", "true"}, + {"blob_garbage_collection_age_cutoff", "0.5"}, }; std::unordered_map db_options_map = { @@ -109,6 +111,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, {"max_open_files", "32"}, {"max_total_wal_size", "33"}, {"use_fsync", "true"}, @@ -230,6 +233,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); + ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, @@ -263,6 +268,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); ASSERT_EQ(new_db_opt.use_fsync, true); @@ -719,12 +725,18 @@ TEST_F(OptionsTest, CompressionOptionsFromString) { ASSERT_OK(GetColumnFamilyOptionsFromString( ignore, ColumnFamilyOptions(), "compression_opts=5:6:7:8:9:x:false", &base_cf_opt)); - 
ASSERT_NOK(GetColumnFamilyOptionsFromString( + ASSERT_OK(GetColumnFamilyOptionsFromString( config_options, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8", &base_cf_opt)); ASSERT_OK(GetColumnFamilyOptionsFromString( ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8", &base_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=1:2:3:4:5:6:true:8:9", &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8:9", + &base_cf_opt)); ASSERT_NOK(GetColumnFamilyOptionsFromString( config_options, ColumnFamilyOptions(), "compression_opts={unknown=bad;}", &base_cf_opt)); @@ -785,6 +797,7 @@ TEST_F(OptionsTest, OldInterfaceTest) { {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, {"max_open_files", "32"}, }; ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); @@ -792,6 +805,7 @@ TEST_F(OptionsTest, OldInterfaceTest) { ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); db_options_map["unknown_option"] = "1"; Status s = GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt); @@ -859,10 +873,11 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.format_version, 5U); ASSERT_EQ(new_opt.whole_key_filtering, true); ASSERT_TRUE(new_opt.filter_policy != nullptr); - const BloomFilterPolicy& bfp = - dynamic_cast(*new_opt.filter_policy); - EXPECT_EQ(bfp.GetMillibitsPerKey(), 4567); - EXPECT_EQ(bfp.GetWholeBitsPerKey(), 5); + const BloomFilterPolicy* bfp = + dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 4567); + EXPECT_EQ(bfp->GetWholeBitsPerKey(), 5); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kAutoBloom); // Verify that only the lower 32bits are stored in // new_opt.read_amp_bytes_per_bit. EXPECT_EQ(1U, new_opt.read_amp_bytes_per_bit); @@ -919,6 +934,23 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + // Ribbon filter policy + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=ribbonfilter:5.678;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + bfp = dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 5678); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + // Old name + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=experimental_ribbon:6.789;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + bfp = dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + // Check block cache options are overwritten when specified // in new format as a struct. ASSERT_OK(GetBlockBasedTableOptionsFromString( @@ -1274,6 +1306,77 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { delete new_cf_opts.compaction_filter; } +TEST_F(OptionsTest, DBOptionsComposeImmutable) { + // Build a DBOptions from an Immutable/Mutable one and verify that + // we get same constituent options. 
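+  // (BuildDBOptions() recombines the ImmutableDBOptions/MutableDBOptions
+  // split back into a single DBOptions, and VerifyDBOptions() then compares
+  // the result field by field against the randomly initialized original.)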
+ ConfigOptions config_options; + Random rnd(301); + DBOptions base_opts, new_opts; + test::RandomInitDBOptions(&base_opts, &rnd); + MutableDBOptions m_opts(base_opts); + ImmutableDBOptions i_opts(base_opts); + new_opts = BuildDBOptions(i_opts, m_opts); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_opts, + new_opts)); +} + +TEST_F(OptionsTest, GetMutableDBOptions) { + Random rnd(228); + DBOptions base_opts; + std::string opts_str; + std::unordered_map opts_map; + ConfigOptions config_options; + + test::RandomInitDBOptions(&base_opts, &rnd); + ImmutableDBOptions i_opts(base_opts); + MutableDBOptions m_opts(base_opts); + MutableDBOptions new_opts; + ASSERT_OK(GetStringFromMutableDBOptions(config_options, m_opts, &opts_str)); + ASSERT_OK(StringToMap(opts_str, &opts_map)); + ASSERT_OK(GetMutableDBOptionsFromStrings(m_opts, opts_map, &new_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( + config_options, base_opts, BuildDBOptions(i_opts, new_opts))); +} + +TEST_F(OptionsTest, CFOptionsComposeImmutable) { + // Build a DBOptions from an Immutable/Mutable one and verify that + // we get same constituent options. + ConfigOptions config_options; + Random rnd(301); + ColumnFamilyOptions base_opts, new_opts; + DBOptions dummy; // Needed to create ImmutableCFOptions + test::RandomInitCFOptions(&base_opts, dummy, &rnd); + MutableCFOptions m_opts(base_opts); + ImmutableCFOptions i_opts(base_opts); + UpdateColumnFamilyOptions(i_opts, &new_opts); + UpdateColumnFamilyOptions(m_opts, &new_opts); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opts, + new_opts)); + delete new_opts.compaction_filter; +} + +TEST_F(OptionsTest, GetMutableCFOptions) { + Random rnd(228); + ColumnFamilyOptions base, copy; + std::string opts_str; + std::unordered_map opts_map; + ConfigOptions config_options; + DBOptions dummy; // Needed to create ImmutableCFOptions + + test::RandomInitCFOptions(&base, dummy, &rnd); + ColumnFamilyOptions result; + MutableCFOptions m_opts(base), new_opts; + + ASSERT_OK(GetStringFromMutableCFOptions(config_options, m_opts, &opts_str)); + ASSERT_OK(StringToMap(opts_str, &opts_map)); + ASSERT_OK(GetMutableOptionsFromStrings(m_opts, opts_map, nullptr, &new_opts)); + UpdateColumnFamilyOptions(ImmutableCFOptions(base), ©); + UpdateColumnFamilyOptions(new_opts, ©); + + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base, copy)); + delete copy.compaction_filter; +} + TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { Options options; ColumnFamilyOptions base_opt, new_opt; @@ -1353,6 +1456,7 @@ TEST_F(OptionsTest, MutableTableOptions) { ASSERT_EQ(bbto->block_size, 1024); ASSERT_OK(bbtf->PrepareOptions(config_options)); ASSERT_TRUE(bbtf->IsPrepared()); + config_options.mutable_options_only = true; ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024")); ASSERT_EQ(bbto->block_align, true); ASSERT_NOK(bbtf->ConfigureOption(config_options, "block_align", "false")); @@ -1372,6 +1476,79 @@ TEST_F(OptionsTest, MutableTableOptions) { ASSERT_EQ(bbto->block_size, 8192); } +TEST_F(OptionsTest, MutableCFOptions) { + ConfigOptions config_options; + ColumnFamilyOptions cf_opts; + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, + "paranoid_file_checks=true; block_based_table_factory.block_align=false; " + "block_based_table_factory.block_size=8192;", + &cf_opts)); + ASSERT_TRUE(cf_opts.paranoid_file_checks); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + const auto bbto = 
cf_opts.table_factory->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->block_size, 8192); + ASSERT_EQ(bbto->block_align, false); + std::unordered_map unused_opts; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"paranoid_file_checks", "false"}}, &cf_opts)); + ASSERT_EQ(cf_opts.paranoid_file_checks, false); + + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "16384"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 16384); + + config_options.mutable_options_only = true; + // Force consistency checks is not mutable + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"force_consistency_checks", "true"}}, + &cf_opts)); + + // Attempt to change the table. It is not mutable, so this should fail and + // leave the original intact + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory", "PlainTable"}}, &cf_opts)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory.id", "PlainTable"}}, &cf_opts)); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + + // Change the block size. Should update the value in the current table + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "8192"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 8192); + + // Attempt to turn off block cache fails, as this option is not mutable + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.no_block_cache", "true"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + + // Attempt to change the block size via a config string/map. Should update + // the current value + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", "{block_size=32768}"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 32768); + + // Attempt to change the block size and no cache through the map. 
Should + // fail, leaving the old values intact + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", + "{block_size=16384; no_block_cache=true}"}}, + &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 32768); +} + #endif // !ROCKSDB_LITE Status StringToMap( @@ -1564,6 +1741,94 @@ TEST_F(OptionsTest, GetStringFromCompressionType) { ASSERT_NOK( GetStringFromCompressionType(&res, static_cast(-10))); } + +TEST_F(OptionsTest, OnlyMutableDBOptions) { + std::string opt_str; + Random rnd(302); + ConfigOptions cfg_opts; + DBOptions db_opts; + DBOptions mdb_opts; + std::unordered_set m_names; + std::unordered_set a_names; + + test::RandomInitDBOptions(&db_opts, &rnd); + auto db_config = DBOptionsAsConfigurable(db_opts); + + // Get all of the DB Option names (mutable or not) + ASSERT_OK(db_config->GetOptionNames(cfg_opts, &a_names)); + + // Get only the mutable options from db_opts and set those in mdb_opts + cfg_opts.mutable_options_only = true; + + // Get only the Mutable DB Option names + ASSERT_OK(db_config->GetOptionNames(cfg_opts, &m_names)); + ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opt_str)); + ASSERT_OK(GetDBOptionsFromString(cfg_opts, mdb_opts, opt_str, &mdb_opts)); + std::string mismatch; + // Comparing only the mutable options, the two are equivalent + auto mdb_config = DBOptionsAsConfigurable(mdb_opts); + ASSERT_TRUE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch)); + ASSERT_TRUE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch)); + + ASSERT_GT(a_names.size(), m_names.size()); + for (const auto& n : m_names) { + std::string m, d; + ASSERT_OK(mdb_config->GetOption(cfg_opts, n, &m)); + ASSERT_OK(db_config->GetOption(cfg_opts, n, &d)); + ASSERT_EQ(m, d); + } + + cfg_opts.mutable_options_only = false; + // Comparing all of the options, the two are not equivalent + ASSERT_FALSE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch)); + ASSERT_FALSE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch)); +} + +TEST_F(OptionsTest, OnlyMutableCFOptions) { + std::string opt_str; + Random rnd(302); + ConfigOptions cfg_opts; + DBOptions db_opts; + ColumnFamilyOptions mcf_opts; + ColumnFamilyOptions cf_opts; + std::unordered_set m_names; + std::unordered_set a_names; + + test::RandomInitCFOptions(&cf_opts, db_opts, &rnd); + auto cf_config = CFOptionsAsConfigurable(cf_opts); + + // Get all of the CF Option names (mutable or not) + ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &a_names)); + + // Get only the mutable options from cf_opts and set those in mcf_opts + cfg_opts.mutable_options_only = true; + // Get only the Mutable CF Option names + ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &m_names)); + ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, cf_opts, &opt_str)); + ASSERT_OK( + GetColumnFamilyOptionsFromString(cfg_opts, mcf_opts, opt_str, &mcf_opts)); + std::string mismatch; + + auto mcf_config = CFOptionsAsConfigurable(mcf_opts); + // Comparing only the mutable options, the two are equivalent + ASSERT_TRUE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch)); + ASSERT_TRUE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch)); + + ASSERT_GT(a_names.size(), m_names.size()); + for (const auto& n : m_names) { + std::string m, d; + ASSERT_OK(mcf_config->GetOption(cfg_opts, n, &m)); + ASSERT_OK(cf_config->GetOption(cfg_opts, n, &d)); + ASSERT_EQ(m, d); + } + + cfg_opts.mutable_options_only = false; + // Comparing 
all of the options, the two are not equivalent + ASSERT_FALSE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch)); + ASSERT_FALSE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch)); + + delete cf_opts.compaction_filter; +} #endif // !ROCKSDB_LITE TEST_F(OptionsTest, ConvertOptionsTest) { @@ -1591,6 +1856,23 @@ TEST_F(OptionsTest, ConvertOptionsTest) { } #ifndef ROCKSDB_LITE +const static std::string kCustomEnvName = "Custom"; +const static std::string kCustomEnvProp = "env=" + kCustomEnvName; +class CustomEnv : public EnvWrapper { + public: + explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} +}; + +static int RegisterCustomEnv(ObjectLibrary& library, const std::string& arg) { + library.Register( + arg, [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + static CustomEnv env(Env::Default()); + return &env; + }); + return 1; +} + // This test suite tests the old APIs into the Configure options methods. // Once those APIs are officially deprecated, this test suite can be deleted. class OptionsOldApiTest : public testing::Test {}; @@ -1653,6 +1935,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"min_blob_size", "1K"}, {"blob_file_size", "1G"}, {"blob_compression_type", "kZSTD"}, + {"enable_blob_garbage_collection", "true"}, + {"blob_garbage_collection_age_cutoff", "0.5"}, }; std::unordered_map db_options_map = { @@ -1660,6 +1944,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, {"max_open_files", "32"}, {"max_total_wal_size", "33"}, {"use_fsync", "true"}, @@ -1773,6 +2058,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); + ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap( @@ -1808,6 +2095,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); ASSERT_EQ(new_db_opt.use_fsync, true); @@ -2284,14 +2572,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { NewBlockBasedTableFactory(block_based_table_options)); // Register an Env with object registry. 
- const static char* kCustomEnvName = "CustomEnv"; - class CustomEnv : public EnvWrapper { - public: - explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} - }; - ObjectLibrary::Default()->Register( - kCustomEnvName, + "CustomEnvDefault", [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, std::string* /* errmsg */) { static CustomEnv env(Env::Default()); @@ -2305,7 +2587,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;" "bottommost_compression_opts=5:6:7;create_if_missing=true;max_open_files=" "1;" - "rate_limiter_bytes_per_sec=1024;env=CustomEnv", + "rate_limiter_bytes_per_sec=1024;env=CustomEnvDefault", &new_options)); ASSERT_EQ(new_options.compression_opts.window_bits, 4); @@ -2339,7 +2621,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); Env* newEnv = new_options.env; - ASSERT_OK(Env::LoadEnv(kCustomEnvName, &newEnv)); + ASSERT_OK(Env::LoadEnv("CustomEnvDefault", &newEnv)); ASSERT_EQ(newEnv, new_options.env); } @@ -2390,14 +2672,10 @@ TEST_F(OptionsOldApiTest, ColumnFamilyOptionsSerialization) { #ifndef ROCKSDB_LITE class OptionsParserTest : public testing::Test { public: - OptionsParserTest() { - env_.reset(new test::StringEnv(Env::Default())); - fs_.reset(new LegacyFileSystemWrapper(env_.get())); - } + OptionsParserTest() { fs_.reset(new test::StringFS(FileSystem::Default())); } protected: - std::unique_ptr env_; - std::unique_ptr fs_; + std::shared_ptr fs_; }; TEST_F(OptionsParserTest, Comment) { @@ -2426,7 +2704,7 @@ TEST_F(OptionsParserTest, Comment) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_OK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2457,7 +2735,7 @@ TEST_F(OptionsParserTest, ExtraSpace) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_OK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2475,7 +2753,7 @@ TEST_F(OptionsParserTest, MissingDBOptions) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2505,7 +2783,7 @@ TEST_F(OptionsParserTest, DoubleDBOptions) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2533,7 +2811,7 @@ TEST_F(OptionsParserTest, NoDefaultCFOptions) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = 
"test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2563,7 +2841,7 @@ TEST_F(OptionsParserTest, DefaultCFOptionsMustBeTheFirst) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2592,7 +2870,7 @@ TEST_F(OptionsParserTest, DuplicateCFOptions) { "[CFOptions \"something_else\"]\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2660,12 +2938,12 @@ TEST_F(OptionsParserTest, IgnoreUnknownOptions) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - auto s = env_->FileExists(kTestFileName); + auto s = fs_->FileExists(kTestFileName, IOOptions(), nullptr); ASSERT_TRUE(s.ok() || s.IsNotFound()); if (s.ok()) { - ASSERT_OK(env_->DeleteFile(kTestFileName)); + ASSERT_OK(fs_->DeleteFile(kTestFileName, IOOptions(), nullptr)); } - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2713,7 +2991,7 @@ TEST_F(OptionsParserTest, ParseVersion) { snprintf(buffer, kLength - 1, file_template.c_str(), iv.c_str()); parser.Reset(); - ASSERT_OK(env_->WriteToNewFile(iv, buffer)); + ASSERT_OK(fs_->WriteToNewFile(iv, buffer)); ASSERT_NOK(parser.Parse(iv, fs_.get(), false, 0 /* readahead_size */)); } @@ -2722,7 +3000,7 @@ TEST_F(OptionsParserTest, ParseVersion) { for (auto vv : valid_versions) { snprintf(buffer, kLength - 1, file_template.c_str(), vv.c_str()); parser.Reset(); - ASSERT_OK(env_->WriteToNewFile(vv, buffer)); + ASSERT_OK(fs_->WriteToNewFile(vv, buffer)); ASSERT_OK(parser.Parse(vv, fs_.get(), false, 0 /* readahead_size */)); } } @@ -2831,37 +3109,37 @@ TEST_F(OptionsParserTest, Readahead) { kOptionsFileName, fs_.get())); uint64_t file_size = 0; - ASSERT_OK(env_->GetFileSize(kOptionsFileName, &file_size)); + ASSERT_OK( + fs_->GetFileSize(kOptionsFileName, IOOptions(), &file_size, nullptr)); assert(file_size > 0); RocksDBOptionsParser parser; - env_->num_seq_file_read_ = 0; + fs_->num_seq_file_read_ = 0; size_t readahead_size = 128 * 1024; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size)); - ASSERT_EQ(env_->num_seq_file_read_.load(), + ASSERT_EQ(fs_->num_seq_file_read_.load(), (file_size - 1) / readahead_size + 1); - env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); readahead_size = 1024 * 1024; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size)); - ASSERT_EQ(env_->num_seq_file_read_.load(), + ASSERT_EQ(fs_->num_seq_file_read_.load(), (file_size - 1) / readahead_size + 1); // Tiny readahead. 8 KB is read each time. 
- env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); ASSERT_OK( parser.Parse(kOptionsFileName, fs_.get(), false, 1 /* readahead_size */)); - ASSERT_GE(env_->num_seq_file_read_.load(), file_size / (8 * 1024)); - ASSERT_LT(env_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2); + ASSERT_GE(fs_->num_seq_file_read_.load(), file_size / (8 * 1024)); + ASSERT_LT(fs_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2); // Disable readahead means 512KB readahead. - env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); ASSERT_OK( parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */)); - ASSERT_GE(env_->num_seq_file_read_.load(), - (file_size - 1) / (512 * 1024) + 1); + ASSERT_GE(fs_->num_seq_file_read_.load(), (file_size - 1) / (512 * 1024) + 1); } TEST_F(OptionsParserTest, DumpAndParse) { @@ -3059,7 +3337,7 @@ class OptionsSanityCheckTest : public OptionsParserTest { } Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) { - Status s = env_->DeleteFile(kOptionsFileName); + Status s = fs_->DeleteFile(kOptionsFileName, IOOptions(), nullptr); if (!s.ok()) { return s; } @@ -3437,8 +3715,8 @@ TEST_F(OptionTypeInfoTest, TestInvalidArgs) { OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions&, const std::string&, - const std::string& value, char* addr) { - auto ptr = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto ptr = static_cast(addr); *ptr = ParseInt(value); return Status::OK(); }); @@ -3451,8 +3729,8 @@ TEST_F(OptionTypeInfoTest, TestParseFunc) { 0, OptionType::kUnknown, OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& name, - const std::string& value, char* addr) { - auto ptr = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto ptr = static_cast(addr); if (name == "Oops") { return Status::InvalidArgument(value); } else { @@ -3472,7 +3750,7 @@ TEST_F(OptionTypeInfoTest, TestSerializeFunc) { 0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone, nullptr, [](const ConfigOptions& /*opts*/, const std::string& name, - const char* /*addr*/, std::string* value) { + const void* /*addr*/, std::string* value) { if (name == "Oops") { return Status::InvalidArgument(name); } else { @@ -3494,9 +3772,9 @@ TEST_F(OptionTypeInfoTest, TestEqualsFunc) { 0, OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone, nullptr, nullptr, [](const ConfigOptions& /*opts*/, const std::string& name, - const char* addr1, const char* addr2, std::string* mismatch) { - auto i1 = *(reinterpret_cast(addr1)); - auto i2 = *(reinterpret_cast(addr2)); + const void* addr1, const void* addr2, std::string* mismatch) { + auto i1 = *(static_cast(addr1)); + auto i2 = *(static_cast(addr2)); if (name == "LT") { return i1 < i2; } else if (name == "GT") { @@ -3550,8 +3828,7 @@ TEST_F(OptionTypeInfoTest, TestOptionFlags) { // An alias can change the value via parse, but does nothing on serialize on // match std::string result; - ASSERT_OK(opt_alias.Parse(config_options, "Alias", "Alias", - reinterpret_cast(&base))); + ASSERT_OK(opt_alias.Parse(config_options, "Alias", "Alias", &base)); ASSERT_OK(opt_alias.Serialize(config_options, "Alias", &base, &result)); ASSERT_TRUE( opt_alias.AreEqual(config_options, "Alias", &base, &comp, &result)); @@ -3762,6 +4039,73 @@ TEST_F(OptionTypeInfoTest, TestVectorType) { ASSERT_EQ(vec1[1], "b1|b2"); ASSERT_EQ(vec1[2], "c1|c2|{d1|d2}"); } + +TEST_F(OptionTypeInfoTest, 
TestStaticType) { + struct SimpleOptions { + size_t size = 0; + bool verify = true; + }; + + static std::unordered_map type_map = { + {"size", {offsetof(struct SimpleOptions, size), OptionType::kSizeT}}, + {"verify", + {offsetof(struct SimpleOptions, verify), OptionType::kBoolean}}, + }; + + ConfigOptions config_options; + SimpleOptions opts, copy; + opts.size = 12345; + opts.verify = false; + std::string str, mismatch; + + ASSERT_OK( + OptionTypeInfo::SerializeType(config_options, type_map, &opts, &str)); + ASSERT_FALSE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts, + ©, &mismatch)); + ASSERT_OK(OptionTypeInfo::ParseType(config_options, str, type_map, ©)); + ASSERT_TRUE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts, + ©, &mismatch)); +} + +class ConfigOptionsTest : public testing::Test {}; + +TEST_F(ConfigOptionsTest, EnvFromConfigOptions) { + ConfigOptions config_options; + DBOptions db_opts; + Options opts; + Env* mem_env = NewMemEnv(Env::Default()); + config_options.registry->AddLibrary("custom-env", RegisterCustomEnv, + kCustomEnvName); + + config_options.env = mem_env; + // First test that we can get the env as expected + ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(), kCustomEnvProp, + &db_opts)); + ASSERT_OK( + GetOptionsFromString(config_options, Options(), kCustomEnvProp, &opts)); + ASSERT_NE(config_options.env, db_opts.env); + ASSERT_EQ(opts.env, db_opts.env); + Env* custom_env = db_opts.env; + + // Now try a "bad" env" and check that nothing changed + config_options.ignore_unsupported_options = true; + ASSERT_OK( + GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts)); + ASSERT_OK(GetOptionsFromString(config_options, opts, "env=unknown", &opts)); + ASSERT_EQ(config_options.env, mem_env); + ASSERT_EQ(db_opts.env, custom_env); + ASSERT_EQ(opts.env, db_opts.env); + + // Now try a "bad" env" ignoring unknown objects + config_options.ignore_unsupported_options = false; + ASSERT_NOK( + GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts)); + ASSERT_EQ(config_options.env, mem_env); + ASSERT_EQ(db_opts.env, custom_env); + ASSERT_EQ(opts.env, db_opts.env); + + delete mem_env; +} #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/plugin/README.md b/plugin/README.md new file mode 100644 index 00000000000..4bae8312f8e --- /dev/null +++ b/plugin/README.md @@ -0,0 +1,25 @@ +## Building external plugins together with RocksDB + +RocksDB offers several plugin interfaces for developers to customize its behavior. One difficulty developers face is how to make their plugin available to end users. The approach discussed here involves building the external code together with the RocksDB code into a single binary. Note another approach we plan to support involves loading plugins dynamically from shared libraries. + +### Discovery + +We hope developers will mention their work in "PLUGINS.md" so users can easily discover and reuse solutions for customizing RocksDB. + +### Directory organization + +External plugins will be linked according to their name into a subdirectory of "plugin/". For example, a plugin called "dedupfs" would be linked into "plugin/dedupfs/". + +### Build standard + +Currently the only supported build system is make. In the plugin directory, files ending in the .mk extension can define the following variables. + +* `$(PLUGIN_NAME)_SOURCES`: these files will be compiled and linked with RocksDB. They can access RocksDB public header files. 
+* `$(PLUGIN_NAME)_HEADERS`: these files will be installed in the RocksDB header directory. Their paths will be prefixed by "rocksdb/plugin/$(PLUGIN_NAME)/". +* `$(PLUGIN_NAME)_LDFLAGS`: these flags will be passed to the final link step. For example, library dependencies can be propagated here, or symbols can be forcibly included, e.g., for static registration. + +Users will run the usual make commands from the RocksDB directory, specifying the plugins to include in a space-separated list in the variable `ROCKSDB_PLUGINS`. + +### Example + +For a working example, see [Dedupfs](https://github.com/ajkr/dedupfs). diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index fbf25c313d0..c0ef19a40bd 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -38,25 +38,54 @@ static inline bool HasJemalloc() { return true; } #else +// definitions for compatibility with older versions of jemalloc +#if !defined(JEMALLOC_ALLOCATOR) +#define JEMALLOC_ALLOCATOR +#endif +#if !defined(JEMALLOC_RESTRICT_RETURN) +#define JEMALLOC_RESTRICT_RETURN +#endif +#if !defined(JEMALLOC_NOTHROW) +#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +#endif +#if !defined(JEMALLOC_ALLOC_SIZE) +#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +#else +#define JEMALLOC_ALLOC_SIZE(s) +#endif +#endif + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. -extern "C" void* mallocx(size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" void* rallocx(void*, size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" size_t sallocx(const void*, int) __attribute__((__nothrow__, __weak__)); -extern "C" void dallocx(void*, int) __attribute__((__nothrow__, __weak__)); -extern "C" void sdallocx(void*, size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" size_t nallocx(size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" int mallctl(const char*, void*, size_t*, void*, size_t) - __attribute__((__nothrow__, __weak__)); -extern "C" int mallctlnametomib(const char*, size_t*, size_t*) - __attribute__((__nothrow__, __weak__)); -extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*, - size_t) __attribute__((__nothrow__, __weak__)); -extern "C" void malloc_stats_print(void (*)(void*, const char*), void*, - const char*) __attribute__((__nothrow__, __weak__)); -extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) - JEMALLOC_CXX_THROW __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) + __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int) + JEMALLOC_ATTR(pure) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctl(const char 
*, void *, size_t *, void *, + size_t) __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *, + size_t *) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *, + size_t *, void *, size_t) + __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW +malloc_stats_print(void (*)(void *, const char *), void *, const char *) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW +malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW + __attribute__((__weak__)); // Check if Jemalloc is linked with the binary. Note the main program might be // using a different memory allocator even this method return true. diff --git a/port/lang.h b/port/lang.h index a5fd364907c..4429f105eed 100644 --- a/port/lang.h +++ b/port/lang.h @@ -14,3 +14,28 @@ #define FALLTHROUGH_INTENDED do {} while (0) #endif #endif + +#if defined(__clang__) +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __has_feature(address_sanitizer) +#endif // defined(__has_feature) +#else // __clang__ +#ifdef __SANITIZE_ADDRESS__ +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __SANITIZE_ADDRESS__ +#endif // __clang__ + +// Coding guidelines say to avoid static objects with non-trivial destructors, +// because it's easy to cause trouble (UB) in static destruction. This +// macro makes it easier to define static objects that are normally never +// destructed, except are destructed when running under ASAN. This should +// avoid unexpected, unnecessary destruction behavior in production. +// Note that constructor arguments can be provided as in +// STATIC_AVOID_DESTRUCTION(Foo, foo)(arg1, arg2); +#ifdef MUST_FREE_HEAP_ALLOCATIONS +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type name +#else +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type& name = *new Type +#endif diff --git a/port/port_posix.cc b/port/port_posix.cc index 8610c6e0d5d..112984de296 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -21,11 +21,12 @@ #include #include #include -#include #include #include + #include -#include "logging/logging.h" + +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -47,7 +48,7 @@ namespace port { static int PthreadCall(const char* label, int result) { if (result != 0 && result != ETIMEDOUT) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str()); abort(); } return result; diff --git a/port/port_posix.h b/port/port_posix.h index a24c7b690e2..90f131e1310 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -167,7 +167,7 @@ static inline void AsmVolatilePause() { #if defined(__i386__) || defined(__x86_64__) asm volatile("pause"); #elif defined(__aarch64__) - asm volatile("wfe"); + asm volatile("yield"); #elif defined(__powerpc64__) asm volatile("or 27,27,27"); #endif diff --git a/port/stack_trace.cc b/port/stack_trace.cc index dee8bd9cea6..c82da2a206b 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -7,7 +7,7 @@ #if defined(ROCKSDB_LITE) || \ !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \ - defined(OS_FREEBSD) || defined(OS_SOLARIS) || defined(OS_WIN) + defined(OS_SOLARIS) || defined(OS_WIN) // noop @@ -32,6 +32,10 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { #include #include +#if defined(OS_FREEBSD) +#include +#endif + namespace ROCKSDB_NAMESPACE { namespace port { @@ 
-41,6 +45,7 @@ namespace { const char* GetExecutableName() { static char name[1024]; +#if !defined(OS_FREEBSD) char link[1024]; snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); auto read = readlink(link, name, sizeof(name) - 1); @@ -50,6 +55,17 @@ const char* GetExecutableName() { name[read] = 0; return name; } +#else + int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; + size_t namesz = sizeof(name); + + auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0); + if (-1 == ret) { + return nullptr; + } else { + return name; + } +#endif } void PrintStackTraceLine(const char* symbol, void* frame) { @@ -144,6 +160,22 @@ static void StackTraceHandler(int sig) { fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); // skip the top three signal handler related frames PrintStack(3); + + // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of + // a signal" have failed, so just warn the user about them. +#if defined(__clang__) && defined(__has_feature) +#if __has_feature(thread_sanitizer) + fprintf(stderr, + "==> NOTE: any above warnings about \"signal-unsafe call\" are\n" + "==> ignorable, as they are expected when generating a stack\n" + "==> trace because of a signal under TSAN. Consider why the\n" + "==> signal was generated to begin with, and the stack trace\n" + "==> in the TSAN warning can be useful for that. (The stack\n" + "==> trace printed by the signal handler is likely obscured\n" + "==> by TSAN output.)\n"); +#endif +#endif + // re-signal to default handler (so we still get core dump if needed...) raise(sig); } diff --git a/port/win/env_default.cc b/port/win/env_default.cc index 36f95fbe342..0c9958ddd9c 100644 --- a/port/win/env_default.cc +++ b/port/win/env_default.cc @@ -11,8 +11,8 @@ #include -#include #include "port/win/env_win.h" +#include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/compression_context_cache.h" #include "util/thread_local.h" diff --git a/port/win/env_win.cc b/port/win/env_win.cc index f7d8f9ce3ce..cc337c1f8c6 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -9,37 +9,37 @@ #if defined(OS_WIN) -#include "port/win/env_win.h" -#include "port/win/win_thread.h" -#include -#include -#include - +#include // _rmdir, _mkdir, _getcwd #include -#include // _access -#include // _rmdir, _mkdir, _getcwd -#include +#include // _access +#include // for uuid generation +#include #include +#include +#include -#include "rocksdb/env.h" -#include "rocksdb/slice.h" - -#include "port/port.h" -#include "port/port_dirent.h" -#include "port/win/win_logger.h" -#include "port/win/io_win.h" +#include +#include +#include #include "monitoring/iostats_context_imp.h" - #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" - -#include // for uuid generation -#include -#include +#include "port/port.h" +#include "port/port_dirent.h" +#include "port/win/env_win.h" +#include "port/win/io_win.h" +#include "port/win/win_logger.h" +#include "port/win/win_thread.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" #include "strsafe.h" +#include "util/string_util.h" -#include +// Undefine the functions windows might use (again)... 
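+// (windows.h defines these names as macros, e.g. DeleteFile expands to
+// DeleteFileA or DeleteFileW, which would otherwise clash with the
+// identically named methods used in this file.)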
+#undef GetCurrentTime +#undef DeleteFile +#undef LoadLibrary namespace ROCKSDB_NAMESPACE { @@ -61,29 +61,19 @@ typedef std::unique_ptr UniqueFindClosePtr; void WinthreadCall(const char* label, std::error_code result) { if (0 != result.value()) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); + fprintf(stderr, "Winthread %s: %s\n", label, + errnoStr(result.value()).c_str()); abort(); } } -} +} // namespace namespace port { - -WinEnvIO::WinEnvIO(Env* hosted_env) - : hosted_env_(hosted_env), - page_size_(4 * 1024), - allocation_granularity_(page_size_), - perf_counter_frequency_(0), +WinClock::WinClock() + : perf_counter_frequency_(0), nano_seconds_per_period_(0), GetSystemTimePreciseAsFileTime_(NULL) { - - SYSTEM_INFO sinfo; - GetSystemInfo(&sinfo); - - page_size_ = sinfo.dwPageSize; - allocation_granularity_ = sinfo.dwAllocationGranularity; - { LARGE_INTEGER qpf; BOOL ret __attribute__((__unused__)); @@ -98,39 +88,91 @@ WinEnvIO::WinEnvIO(Env* hosted_env) HMODULE module = GetModuleHandle("kernel32.dll"); if (module != NULL) { - GetSystemTimePreciseAsFileTime_ = - (FnGetSystemTimePreciseAsFileTime)GetProcAddress( - module, "GetSystemTimePreciseAsFileTime"); + GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)( + void*)GetProcAddress(module, "GetSystemTimePreciseAsFileTime"); } } -WinEnvIO::~WinEnvIO() { +void WinClock::SleepForMicroseconds(int micros) { + std::this_thread::sleep_for(std::chrono::microseconds(micros)); } -Status WinEnvIO::DeleteFile(const std::string& fname) { - Status result; +std::string WinClock::TimeToString(uint64_t secondsSince1970) { + std::string result; - BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + const time_t seconds = secondsSince1970; + const int maxsize = 64; - if(!ret) { - auto lastError = GetLastError(); - result = IOErrorFromWindowsError("Failed to delete: " + fname, - lastError); + struct tm t; + errno_t ret = localtime_s(&t, &seconds); + + if (ret) { + result = std::to_string(seconds); + } else { + result.resize(maxsize); + char* p = &result[0]; + + int len = + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + assert(len > 0); + + result.resize(len); } return result; } -Status WinEnvIO::Truncate(const std::string& fname, size_t size) { - Status s; - int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); - if (result != 0) { - s = IOError("Failed to truncate: " + fname, errno); +uint64_t WinClock::NowMicros() { + if (GetSystemTimePreciseAsFileTime_ != NULL) { + // all std::chrono clocks on windows proved to return + // values that may repeat that is not good enough for some uses. 
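+    // FILETIME counts 100-nanosecond ticks since 1601-01-01, so converting to
+    // Unix time means subtracting the ticks accumulated before 1970-01-01 and
+    // then dividing by 10 ticks per microsecond.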
+ const int64_t c_UnixEpochStartTicks = 116444736000000000LL; + const int64_t c_FtToMicroSec = 10; + + // This interface needs to return system time and not + // just any microseconds because it is often used as an argument + // to TimedWait() on condition variable + FILETIME ftSystemTime; + GetSystemTimePreciseAsFileTime_(&ftSystemTime); + + LARGE_INTEGER li; + li.LowPart = ftSystemTime.dwLowDateTime; + li.HighPart = ftSystemTime.dwHighDateTime; + // Subtract unix epoch start + li.QuadPart -= c_UnixEpochStartTicks; + // Convert to microsecs + li.QuadPart /= c_FtToMicroSec; + return li.QuadPart; } - return s; + using namespace std::chrono; + return duration_cast(system_clock::now().time_since_epoch()) + .count(); +} + +uint64_t WinClock::NowNanos() { + if (nano_seconds_per_period_ != 0) { + // all std::chrono clocks on windows have the same resolution that is only + // good enough for microseconds but not nanoseconds + // On Windows 8 and Windows 2012 Server + // GetSystemTimePreciseAsFileTime(¤t_time) can be used + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + // Convert performance counter to nanoseconds by precomputed ratio. + // Directly multiply nano::den with li.QuadPart causes overflow. + // Only do this when nano::den is divisible by perf_counter_frequency_, + // which most likely is the case in reality. If it's not, fall back to + // high_resolution_clock, which may be less precise under old compilers. + li.QuadPart *= nano_seconds_per_period_; + return li.QuadPart; + } + using namespace std::chrono; + return duration_cast( + high_resolution_clock::now().time_since_epoch()) + .count(); } -Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { +Status WinClock::GetCurrentTime(int64_t* unix_time) { time_t time = std::time(nullptr); if (time == (time_t)(-1)) { return Status::NotSupported("Failed to get time"); @@ -140,10 +182,55 @@ Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { return Status::OK(); } -Status WinEnvIO::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - Status s; +WinFileSystem::WinFileSystem(const std::shared_ptr& clock) + : clock_(clock), page_size_(4 * 1024), allocation_granularity_(page_size_) { + SYSTEM_INFO sinfo; + GetSystemInfo(&sinfo); + + page_size_ = sinfo.dwPageSize; + allocation_granularity_ = sinfo.dwAllocationGranularity; +} + +const std::shared_ptr& WinFileSystem::Default() { + static std::shared_ptr fs = + std::make_shared(WinClock::Default()); + return fs; +} + +WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env) {} + +WinEnvIO::~WinEnvIO() {} + +IOStatus WinFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to delete: " + fname, lastError); + } + + return result; +} + +IOStatus WinFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); + if (result != 0) { + s = IOError("Failed to truncate: " + fname, errno); + } + return s; +} + +IOStatus WinFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + IOStatus s; result->reset(); @@ -177,11 +264,11 @@ Status WinEnvIO::NewSequentialFile(const std::string& fname, return s; } -Status 
WinEnvIO::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { +IOStatus WinFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { result->reset(); - Status s; + IOStatus s; // Open the file for read-only random access // Random access is to disable read-ahead as the system reads too much data @@ -198,10 +285,10 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = RX_CreateFile( - RX_FN(fname).c_str(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, OPEN_EXISTING, fileFlags, NULL); + hFile = + RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -217,13 +304,13 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, // Use mmap when virtual address-space is plentiful. uint64_t fileSize; - s = GetFileSize(fname, &fileSize); + s = GetFileSize(fname, IOOptions(), &fileSize, dbg); if (s.ok()) { // Will not map empty files if (fileSize == 0) { - return IOError( - "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); + return IOError("NewRandomAccessFile failed to map empty file: " + fname, + EINVAL); } HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY, @@ -241,11 +328,11 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); const void* mapped_region = - MapViewOfFileEx(hMap, FILE_MAP_READ, - 0, // High DWORD of access start - 0, // Low DWORD - static_cast(fileSize), - NULL); // Let the OS choose the mapping + MapViewOfFileEx(hMap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast(fileSize), + NULL); // Let the OS choose the mapping if (!mapped_region) { auto lastError = GetLastError(); @@ -261,26 +348,22 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, fileGuard.release(); } } else { - result->reset(new WinRandomAccessFile(fname, hFile, - std::max(GetSectorSize(fname), - page_size_), - options)); + result->reset(new WinRandomAccessFile( + fname, hFile, std::max(GetSectorSize(fname), page_size_), options)); fileGuard.release(); } return s; } -Status WinEnvIO::OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen) { - +IOStatus WinFileSystem::OpenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, bool reopen) { const size_t c_BufferCapacity = 64 * 1024; EnvOptions local_options(options); result->reset(); - Status s; + IOStatus s; DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; @@ -317,11 +400,11 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, RX_FN(fname).c_str(), desired_access, // Access desired shared_mode, - NULL, // Security attributes + NULL, // Security attributes // Posix env says (reopen) ? 
(O_CREATE | O_APPEND) : O_CREAT | O_TRUNC creation_disposition, - fileFlags, // Flags - NULL); // Template File + fileFlags, // Flags + NULL); // Template File } if (INVALID_HANDLE_VALUE == hFile) { @@ -351,25 +434,37 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, } else { // Here we want the buffer allocation to be aligned by the SSD page size // and to be a multiple of it - result->reset(new WinWritableFile(fname, hFile, - std::max(GetSectorSize(fname), - GetPageSize()), - c_BufferCapacity, local_options)); + result->reset(new WinWritableFile( + fname, hFile, std::max(GetSectorSize(fname), GetPageSize()), + c_BufferCapacity, local_options)); } return s; } -Status WinEnvIO::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, - const EnvOptions & options) { +IOStatus WinFileSystem::NewWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, false); +} - Status s; +IOStatus WinFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, true); +} + +IOStatus WinFileSystem::NewRandomRWFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; // Open the file for read-only random access // Random access is to disable read-ahead as the system reads too much data DWORD desired_access = GENERIC_READ | GENERIC_WRITE; DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist + DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist DWORD file_flags = FILE_FLAG_RANDOM_ACCESS; if (options.use_direct_reads && options.use_direct_writes) { @@ -381,36 +476,28 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = - RX_CreateFile(RX_FN(fname).c_str(), - desired_access, - shared_mode, - NULL, // Security attributes - creation_disposition, - file_flags, - NULL); + hFile = RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode, + NULL, // Security attributes + creation_disposition, file_flags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "NewRandomRWFile failed to Create/Open: " + fname, lastError); + "NewRandomRWFile failed to Create/Open: " + fname, lastError); } UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - result->reset(new WinRandomRWFile(fname, hFile, - std::max(GetSectorSize(fname), - GetPageSize()), - options)); + result->reset(new WinRandomRWFile( + fname, hFile, std::max(GetSectorSize(fname), GetPageSize()), options)); fileGuard.release(); return s; } -Status WinEnvIO::NewMemoryMappedFileBuffer( - const std::string & fname, - std::unique_ptr* result) { - Status s; +IOStatus WinFileSystem::NewMemoryMappedFileBuffer( + const std::string& fname, std::unique_ptr* result) { + IOStatus s; result->reset(); DWORD fileFlags = FILE_ATTRIBUTE_READONLY; @@ -420,11 +507,9 @@ Status WinEnvIO::NewMemoryMappedFileBuffer( IOSTATS_TIMER_GUARD(open_nanos); hFile = RX_CreateFile( RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING, // Open only if it exists 
- fileFlags, - NULL); + fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -436,21 +521,21 @@ Status WinEnvIO::NewMemoryMappedFileBuffer( UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); uint64_t fileSize = 0; - s = GetFileSize(fname, &fileSize); + s = GetFileSize(fname, IOOptions(), &fileSize, nullptr); if (!s.ok()) { return s; } // Will not map empty files if (fileSize == 0) { - return Status::NotSupported( + return IOStatus::NotSupported( "NewMemoryMappedFileBuffer can not map zero length files: " + fname); } // size_t is 32-bit with 32-bit builds if (fileSize > std::numeric_limits::max()) { - return Status::NotSupported( - "The specified file size does not fit into 32-bit memory addressing: " - + fname); + return IOStatus::NotSupported( + "The specified file size does not fit into 32-bit memory addressing: " + + fname); } HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE, @@ -487,15 +572,16 @@ Status WinEnvIO::NewMemoryMappedFileBuffer( return s; } -Status WinEnvIO::NewDirectory(const std::string& name, - std::unique_ptr* result) { - Status s; +IOStatus WinFileSystem::NewDirectory(const std::string& name, + const IOOptions& /*options*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; // Must be nullptr on failure result->reset(); if (!DirExists(name)) { - s = IOErrorFromWindowsError( - "open folder: " + name, ERROR_DIRECTORY); + s = IOErrorFromWindowsError("open folder: " + name, ERROR_DIRECTORY); return s; } @@ -505,10 +591,9 @@ Status WinEnvIO::NewDirectory(const std::string& name, IOSTATS_TIMER_GUARD(open_nanos); handle = RX_CreateFile( RX_FN(name).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); } @@ -523,8 +608,10 @@ Status WinEnvIO::NewDirectory(const std::string& name, return s; } -Status WinEnvIO::FileExists(const std::string& fname) { - Status s; +IOStatus WinFileSystem::FileExists(const std::string& fname, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; // TODO: This does not follow symbolic links at this point // which is consistent with _access() impl on windows // but can be added @@ -533,70 +620,74 @@ Status WinEnvIO::FileExists(const std::string& fname) { GetFileExInfoStandard, &attrs)) { auto lastError = GetLastError(); switch (lastError) { - case ERROR_ACCESS_DENIED: - case ERROR_NOT_FOUND: - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - s = Status::NotFound(); - break; - default: - s = IOErrorFromWindowsError("Unexpected error for: " + fname, - lastError); - break; + case ERROR_ACCESS_DENIED: + case ERROR_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + s = IOStatus::NotFound(); + break; + default: + s = IOErrorFromWindowsError("Unexpected error for: " + fname, + lastError); + break; } } return s; } -Status WinEnvIO::GetChildren(const std::string& dir, - std::vector* result) { - - Status status; +IOStatus WinFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*opts*/, + std::vector* result, + IODebugContext* /*dbg*/) { + IOStatus status; result->clear(); - std::vector output; RX_WIN32_FIND_DATA data; memset(&data, 0, sizeof(data)); std::string pattern(dir); pattern.append("\\").append("*"); - HANDLE handle = RX_FindFirstFileEx(RX_FN(pattern).c_str(), - // Do not want alternative name - FindExInfoBasic, - 
&data, - FindExSearchNameMatch, - NULL, // lpSearchFilter - 0); + HANDLE handle = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + // Do not want alternative name + FindExInfoBasic, &data, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); if (handle == INVALID_HANDLE_VALUE) { auto lastError = GetLastError(); switch (lastError) { - case ERROR_NOT_FOUND: - case ERROR_ACCESS_DENIED: - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - status = Status::NotFound(); - break; - default: - status = IOErrorFromWindowsError( - "Failed to GetChhildren for: " + dir, lastError); + case ERROR_NOT_FOUND: + case ERROR_ACCESS_DENIED: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + status = IOStatus::NotFound(); + break; + default: + status = IOErrorFromWindowsError("Failed to GetChhildren for: " + dir, + lastError); } return status; } UniqueFindClosePtr fc(handle, FindCloseFunc); - if (result->capacity() > 0) { - output.reserve(result->capacity()); - } - // For safety data.cFileName[MAX_PATH - 1] = 0; while (true) { - auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); - output.emplace_back(FN_TO_RX(x)); - BOOL ret =- RX_FindNextFile(handle, &data); + // filter out '.' and '..' directory entries + // which appear only on some platforms + const bool ignore = + ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) && + (RX_FNCMP(data.cFileName, ".") == 0 || + RX_FNCMP(data.cFileName, "..") == 0); + if (!ignore) { + auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); + result->push_back(FN_TO_RX(x)); + } + + BOOL ret = -RX_FindNextFile(handle, &data); // If the function fails the return value is zero // and non-zero otherwise. Not TRUE or FALSE. if (ret == FALSE) { @@ -605,24 +696,27 @@ Status WinEnvIO::GetChildren(const std::string& dir, } data.cFileName[MAX_PATH - 1] = 0; } - output.swap(*result); return status; } -Status WinEnvIO::CreateDir(const std::string& name) { - Status result; +IOStatus WinFileSystem::CreateDir(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); if (!ret) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create a directory: " + name, lastError); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); } return result; } -Status WinEnvIO::CreateDirIfMissing(const std::string& name) { - Status result; +IOStatus WinFileSystem::CreateDirIfMissing(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; if (DirExists(name)) { return result; @@ -632,30 +726,32 @@ Status WinEnvIO::CreateDirIfMissing(const std::string& name) { if (!ret) { auto lastError = GetLastError(); if (lastError != ERROR_ALREADY_EXISTS) { - result = IOErrorFromWindowsError( - "Failed to create a directory: " + name, lastError); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); } else { - result = - Status::IOError(name + ": exists but is not a directory"); + result = IOStatus::IOError(name + ": exists but is not a directory"); } } return result; } -Status WinEnvIO::DeleteDir(const std::string& name) { - Status result; +IOStatus WinFileSystem::DeleteDir(const std::string& name, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str()); if (!ret) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError("Failed to remove dir: " 
+ name, - lastError); + result = + IOErrorFromWindowsError("Failed to remove dir: " + name, lastError); } return result; } -Status WinEnvIO::GetFileSize(const std::string& fname, - uint64_t* size) { - Status s; +IOStatus WinFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* size, + IODebugContext* /*dbg*/) { + IOStatus s; WIN32_FILE_ATTRIBUTE_DATA attrs; if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, @@ -671,7 +767,7 @@ Status WinEnvIO::GetFileSize(const std::string& fname, return s; } -uint64_t WinEnvIO::FileTimeToUnixTime(const FILETIME& ftTime) { +uint64_t WinFileSystem::FileTimeToUnixTime(const FILETIME& ftTime) { const uint64_t c_FileTimePerSecond = 10000000U; // UNIX epoch starts on 1970-01-01T00:00:00Z // Windows FILETIME starts on 1601-01-01T00:00:00Z @@ -685,31 +781,35 @@ uint64_t WinEnvIO::FileTimeToUnixTime(const FILETIME& ftTime) { li.LowPart = ftTime.dwLowDateTime; uint64_t result = - (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; + (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; return result; } -Status WinEnvIO::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { - Status s; +IOStatus WinFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*opts*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) { + IOStatus s; WIN32_FILE_ATTRIBUTE_DATA attrs; if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, - &attrs)) { + &attrs)) { *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); } else { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Can not get file modification time for: " + fname, lastError); + "Can not get file modification time for: " + fname, lastError); *file_mtime = 0; } return s; } -Status WinEnvIO::RenameFile(const std::string& src, - const std::string& target) { - Status result; +IOStatus WinFileSystem::RenameFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; // rename() is not capable of replacing the existing file as on Linux // so use OS API directly @@ -726,14 +826,16 @@ Status WinEnvIO::RenameFile(const std::string& src, return result; } -Status WinEnvIO::LinkFile(const std::string& src, - const std::string& target) { - Status result; +IOStatus WinFileSystem::LinkFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; - if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { + if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { DWORD lastError = GetLastError(); if (lastError == ERROR_NOT_SAME_DEVICE) { - return Status::NotSupported("No cross FS links allowed"); + return IOStatus::NotSupported("No cross FS links allowed"); } std::string text("Failed to link: "); @@ -745,12 +847,14 @@ Status WinEnvIO::LinkFile(const std::string& src, return result; } -Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { - Status s; - HANDLE handle = RX_CreateFile( - RX_FN(fname).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); +IOStatus WinFileSystem::NumFileLinks(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* count, + IODebugContext* /*dbg*/) { + IOStatus s; + HANDLE handle = + RX_CreateFile(RX_FN(fname).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + 
NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); if (INVALID_HANDLE_VALUE == handle) { auto lastError = GetLastError(); @@ -771,26 +875,27 @@ Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { return s; } -Status WinEnvIO::AreFilesSame(const std::string& first, - const std::string& second, bool* res) { +IOStatus WinFileSystem::AreFilesSame(const std::string& first, + const std::string& second, + const IOOptions& /*opts*/, bool* res, + IODebugContext* /*dbg*/) { // For MinGW builds #if (_WIN32_WINNT == _WIN32_WINNT_VISTA) - Status s = Status::NotSupported(); + IOStatus s = IOStatus::NotSupported(); #else assert(res != nullptr); - Status s; + IOStatus s; if (res == nullptr) { - s = Status::InvalidArgument("res"); + s = IOStatus::InvalidArgument("res"); return s; } // 0 - for access means read metadata HANDLE file_1 = RX_CreateFile( RX_FN(first).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); if (INVALID_HANDLE_VALUE == file_1) { @@ -802,9 +907,9 @@ Status WinEnvIO::AreFilesSame(const std::string& first, HANDLE file_2 = RX_CreateFile( RX_FN(second).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); if (INVALID_HANDLE_VALUE == file_2) { @@ -824,9 +929,9 @@ Status WinEnvIO::AreFilesSame(const std::string& first, return s; } - FILE_ID_INFO FileInfo_2; - result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, - sizeof(FileInfo_2)); + FILE_ID_INFO FileInfo_2; + result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, + sizeof(FileInfo_2)); if (!result) { auto lastError = GetLastError(); @@ -835,9 +940,9 @@ Status WinEnvIO::AreFilesSame(const std::string& first, } if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) { - *res = (0 == memcmp(FileInfo_1.FileId.Identifier, - FileInfo_2.FileId.Identifier, - sizeof(FileInfo_1.FileId.Identifier))); + *res = + (0 == memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier, + sizeof(FileInfo_1.FileId.Identifier))); } else { *res = false; } @@ -845,12 +950,13 @@ Status WinEnvIO::AreFilesSame(const std::string& first, return s; } -Status WinEnvIO::LockFile(const std::string& lockFname, - FileLock** lock) { +IOStatus WinFileSystem::LockFile(const std::string& lockFname, + const IOOptions& /*opts*/, FileLock** lock, + IODebugContext* /*dbg*/) { assert(lock != nullptr); *lock = NULL; - Status result; + IOStatus result; // No-sharing, this is a LOCK file const DWORD ExclusiveAccessON = 0; @@ -862,15 +968,14 @@ Status WinEnvIO::LockFile(const std::string& lockFname, { IOSTATS_TIMER_GUARD(open_nanos); hFile = RX_CreateFile(RX_FN(lockFname).c_str(), - (GENERIC_READ | GENERIC_WRITE), - ExclusiveAccessON, NULL, CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON, + NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create lock file: " + lockFname, lastError); + result = IOErrorFromWindowsError("Failed to create lock file: " + 
lockFname, + lastError); } else { *lock = new WinFileLock(hFile); } @@ -878,8 +983,9 @@ Status WinEnvIO::LockFile(const std::string& lockFname, return result; } -Status WinEnvIO::UnlockFile(FileLock* lock) { - Status result; +IOStatus WinFileSystem::UnlockFile(FileLock* lock, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; assert(lock != nullptr); @@ -888,8 +994,9 @@ Status WinEnvIO::UnlockFile(FileLock* lock) { return result; } -Status WinEnvIO::GetTestDirectory(std::string* result) { - +IOStatus WinFileSystem::GetTestDirectory(const IOOptions& opts, + std::string* result, + IODebugContext* dbg) { std::string output; const char* env = getenv("TEST_TMPDIR"); @@ -904,21 +1011,23 @@ Status WinEnvIO::GetTestDirectory(std::string* result) { output = "c:\\tmp"; } } - CreateDir(output); + CreateDir(output, opts, dbg); output.append("\\testrocksdb-"); output.append(std::to_string(GetCurrentProcessId())); - CreateDir(output); + CreateDir(output, opts, dbg); output.swap(*result); - return Status::OK(); + return IOStatus::OK(); } -Status WinEnvIO::NewLogger(const std::string& fname, - std::shared_ptr* result) { - Status s; +IOStatus WinFileSystem::NewLogger(const std::string& fname, + const IOOptions& /*opts*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; result->reset(); @@ -951,72 +1060,25 @@ Status WinEnvIO::NewLogger(const std::string& fname, // Set creation, last access and last write time to the same value SetFileTime(hFile, &ft, &ft, &ft); } - result->reset(new WinLogger(&WinEnvThreads::gettid, hosted_env_, hFile)); + result->reset(new WinLogger(&WinEnvThreads::gettid, clock_.get(), hFile)); } return s; } -Status WinEnvIO::IsDirectory(const std::string& path, bool* is_dir) { +IOStatus WinFileSystem::IsDirectory(const std::string& path, + const IOOptions& /*opts*/, bool* is_dir, + IODebugContext* /*dbg*/) { BOOL ret = RX_PathIsDirectory(RX_FN(path).c_str()); if (is_dir) { *is_dir = ret ? true : false; } - return Status::OK(); -} - -uint64_t WinEnvIO::NowMicros() { - - if (GetSystemTimePreciseAsFileTime_ != NULL) { - // all std::chrono clocks on windows proved to return - // values that may repeat that is not good enough for some uses. - const int64_t c_UnixEpochStartTicks = 116444736000000000LL; - const int64_t c_FtToMicroSec = 10; - - // This interface needs to return system time and not - // just any microseconds because it is often used as an argument - // to TimedWait() on condition variable - FILETIME ftSystemTime; - GetSystemTimePreciseAsFileTime_(&ftSystemTime); - - LARGE_INTEGER li; - li.LowPart = ftSystemTime.dwLowDateTime; - li.HighPart = ftSystemTime.dwHighDateTime; - // Subtract unix epoch start - li.QuadPart -= c_UnixEpochStartTicks; - // Convert to microsecs - li.QuadPart /= c_FtToMicroSec; - return li.QuadPart; - } - using namespace std::chrono; - return duration_cast(system_clock::now().time_since_epoch()) - .count(); -} - -uint64_t WinEnvIO::NowNanos() { - if (nano_seconds_per_period_ != 0) { - // all std::chrono clocks on windows have the same resolution that is only - // good enough for microseconds but not nanoseconds - // On Windows 8 and Windows 2012 Server - // GetSystemTimePreciseAsFileTime(¤t_time) can be used - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - // Convert performance counter to nanoseconds by precomputed ratio. - // Directly multiply nano::den with li.QuadPart causes overflow. - // Only do this when nano::den is divisible by perf_counter_frequency_, - // which most likely is the case in reality. 
If it's not, fall back to - // high_resolution_clock, which may be less precise under old compilers. - li.QuadPart *= nano_seconds_per_period_; - return li.QuadPart; - } - using namespace std::chrono; - return duration_cast( - high_resolution_clock::now().time_since_epoch()).count(); + return IOStatus::OK(); } Status WinEnvIO::GetHostName(char* name, uint64_t len) { Status s; DWORD nSize = static_cast( - std::min(len, std::numeric_limits::max())); + std::min(len, std::numeric_limits::max())); if (!::GetComputerNameA(name, &nSize)) { auto lastError = GetLastError(); @@ -1028,15 +1090,17 @@ Status WinEnvIO::GetHostName(char* name, uint64_t len) { return s; } -Status WinEnvIO::GetAbsolutePath(const std::string& db_path, - std::string* output_path) { +IOStatus WinFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* dbg) { // Check if we already have an absolute path // For test compatibility we will consider starting slash as an // absolute path if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) || - !RX_PathIsRelative(RX_FN(db_path).c_str())) { + !RX_PathIsRelative(RX_FN(db_path).c_str())) { *output_path = db_path; - return Status::OK(); + return IOStatus::OK(); } RX_FILESTRING result; @@ -1055,42 +1119,19 @@ Status WinEnvIO::GetAbsolutePath(const std::string& db_path, std::string res = FN_TO_RX(result); res.swap(*output_path); - return Status::OK(); -} - -std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) { - std::string result; - - const time_t seconds = secondsSince1970; - const int maxsize = 64; - - struct tm t; - errno_t ret = localtime_s(&t, &seconds); - - if (ret) { - result = std::to_string(seconds); - } else { - result.resize(maxsize); - char* p = &result[0]; - - int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, - t.tm_min, t.tm_sec); - assert(len > 0); - - result.resize(len); - } - - return result; + return IOStatus::OK(); } -Status WinEnvIO::GetFreeSpace(const std::string& path, uint64_t* diskfree) { +IOStatus WinFileSystem::GetFreeSpace(const std::string& path, + const IOOptions& /*options*/, + uint64_t* diskfree, + IODebugContext* /*dbg*/) { assert(diskfree != nullptr); ULARGE_INTEGER freeBytes; BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL); if (f) { *diskfree = freeBytes.QuadPart; - return Status::OK(); + return IOStatus::OK(); } else { DWORD lastError = GetLastError(); return IOErrorFromWindowsError("Failed to get free space: " + path, @@ -1098,9 +1139,9 @@ Status WinEnvIO::GetFreeSpace(const std::string& path, uint64_t* diskfree) { } } -EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForLogWrite( + const FileOptions& file_options, const DBOptions& db_options) const { + FileOptions optimized(file_options); // These two the same as default optimizations optimized.bytes_per_sync = db_options.wal_bytes_per_sync; optimized.writable_file_max_buffer_size = @@ -1114,33 +1155,33 @@ EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, return optimized; } -EnvOptions WinEnvIO::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForManifestWrite( + const FileOptions& options) const { + FileOptions optimized(options); optimized.use_mmap_writes = 
false; optimized.use_direct_reads = false; return optimized; } -EnvOptions WinEnvIO::OptimizeForManifestRead( - const EnvOptions& env_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForManifestRead( + const FileOptions& file_options) const { + FileOptions optimized(file_options); optimized.use_mmap_writes = false; optimized.use_direct_reads = false; return optimized; } // Returns true iff the named directory exists and is a directory. -bool WinEnvIO::DirExists(const std::string& dname) { +bool WinFileSystem::DirExists(const std::string& dname) { WIN32_FILE_ATTRIBUTE_DATA attrs; - if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), - GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard, + &attrs)) { return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); } return false; } -size_t WinEnvIO::GetSectorSize(const std::string& fname) { +size_t WinFileSystem::GetSectorSize(const std::string& fname) { size_t sector_size = kSectorSize; if (RX_PathIsRelative(RX_FN(fname).c_str())) { @@ -1170,21 +1211,21 @@ size_t WinEnvIO::GetSectorSize(const std::string& fname) { BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)]; DWORD output_bytes = 0; - BOOL ret = DeviceIoControl(hDevice, IOCTL_STORAGE_QUERY_PROPERTY, - &spropertyquery, sizeof(spropertyquery), - output_buffer, - sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), - &output_bytes, nullptr); + BOOL ret = DeviceIoControl( + hDevice, IOCTL_STORAGE_QUERY_PROPERTY, &spropertyquery, + sizeof(spropertyquery), output_buffer, + sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr); if (ret) { - sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR *)output_buffer)->BytesPerLogicalSector; + sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR*)output_buffer) + ->BytesPerLogicalSector; } else { - // many devices do not support StorageProcessAlignmentProperty. Any failure here and we - // fall back to logical alignment + // many devices do not support StorageProcessAlignmentProperty. Any failure + // here and we fall back to logical alignment - DISK_GEOMETRY_EX geometry = { 0 }; - ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, - nullptr, 0, &geometry, sizeof(geometry), &output_bytes, nullptr); + DISK_GEOMETRY_EX geometry = {0}; + ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, nullptr, 0, + &geometry, sizeof(geometry), &output_bytes, nullptr); if (ret) { sector_size = geometry.Geometry.BytesPerSector; } @@ -1202,17 +1243,15 @@ size_t WinEnvIO::GetSectorSize(const std::string& fname) { WinEnvThreads::WinEnvThreads(Env* hosted_env) : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { - for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( - static_cast(pool_id)); + static_cast(pool_id)); // This allows later initializing the thread-local-env of each thread. 
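// [Sketch, not part of the patch] This constructor loop creates one pool per
// Env::Priority value and tags it with that priority; Schedule() below then
// routes a job to the pool matching the requested priority. Assuming a
// WinEnvThreads instance `threads` and a caller-defined callback `MyJob`:
//
//   void MyJob(void* arg);  // hypothetical worker function
//   threads.Schedule(&MyJob, arg, Env::Priority::LOW, /*tag=*/nullptr,
//                    /*unschedFunction=*/nullptr);
//   // dispatches to thread_pools_[Env::Priority::LOW]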
thread_pools_[pool_id].SetHostEnv(hosted_env); } } WinEnvThreads::~WinEnvThreads() { - WaitForJoin(); for (auto& thpool : thread_pools_) { @@ -1220,9 +1259,9 @@ WinEnvThreads::~WinEnvThreads() { } } -void WinEnvThreads::Schedule(void(*function)(void*), void* arg, +void WinEnvThreads::Schedule(void (*function)(void*), void* arg, Env::Priority pri, void* tag, - void(*unschedFunction)(void* arg)) { + void (*unschedFunction)(void* arg)) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); } @@ -1233,21 +1272,21 @@ int WinEnvThreads::UnSchedule(void* arg, Env::Priority pri) { namespace { - struct StartThreadState { - void(*user_function)(void*); - void* arg; - }; +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; - void* StartThreadWrapper(void* arg) { - std::unique_ptr state( +void* StartThreadWrapper(void* arg) { + std::unique_ptr state( reinterpret_cast(arg)); - state->user_function(state->arg); - return nullptr; - } - + state->user_function(state->arg); + return nullptr; } -void WinEnvThreads::StartThread(void(*function)(void* arg), void* arg) { +} // namespace + +void WinEnvThreads::StartThread(void (*function)(void* arg), void* arg) { std::unique_ptr state(new StartThreadState); state->user_function = function; state->arg = arg; @@ -1282,10 +1321,6 @@ uint64_t WinEnvThreads::gettid() { uint64_t WinEnvThreads::GetThreadID() const { return gettid(); } -void WinEnvThreads::SleepForMicroseconds(int micros) { - std::this_thread::sleep_for(std::chrono::microseconds(micros)); -} - void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].SetBackgroundThreads(num); @@ -1304,12 +1339,14 @@ void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { ///////////////////////////////////////////////////////////////////////// // WinEnv -WinEnv::WinEnv() : winenv_io_(this), winenv_threads_(this) { +WinEnv::WinEnv() + : CompositeEnv(WinFileSystem::Default(), WinClock::Default()), + winenv_io_(this), + winenv_threads_(this) { // Protected member of the base class thread_status_updater_ = CreateThreadStatusUpdater(); } - WinEnv::~WinEnv() { // All threads must be joined before the deletion of // thread_status_updater_. 
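// [Sketch, not part of the patch] With WinEnv now built on
// CompositeEnv(WinFileSystem::Default(), WinClock::Default()), file I/O flows
// through the FileSystem API rather than Env virtuals. A minimal caller-side
// illustration, assuming "C:\\db" is an existing database directory:
//
//   std::shared_ptr<FileSystem> fs = FileSystem::Default();  // WinFileSystem
//   uint64_t size = 0;
//   IOStatus io_s =
//       fs->GetFileSize("C:\\db\\CURRENT", IOOptions(), &size, /*dbg=*/nullptr);
//   std::unique_ptr<Env> env = NewCompositeEnv(fs);  // Env facade over `fs`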
@@ -1321,155 +1358,12 @@ Status WinEnv::GetThreadList(std::vector* thread_list) { return thread_status_updater_->GetThreadList(thread_list); } -Status WinEnv::DeleteFile(const std::string& fname) { - return winenv_io_.DeleteFile(fname); -} - -Status WinEnv::Truncate(const std::string& fname, size_t size) { - return winenv_io_.Truncate(fname, size); -} - -Status WinEnv::GetCurrentTime(int64_t* unix_time) { - return winenv_io_.GetCurrentTime(unix_time); -} - -Status WinEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.NewSequentialFile(fname, result, options); -} - -Status WinEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.NewRandomAccessFile(fname, result, options); -} - -Status WinEnv::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.OpenWritableFile(fname, result, options, false); -} - -Status WinEnv::ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.OpenWritableFile(fname, result, options, true); -} - -Status WinEnv::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, - const EnvOptions & options) { - return winenv_io_.NewRandomRWFile(fname, result, options); -} - -Status WinEnv::NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result) { - return winenv_io_.NewMemoryMappedFileBuffer(fname, result); -} - -Status WinEnv::NewDirectory(const std::string& name, - std::unique_ptr* result) { - return winenv_io_.NewDirectory(name, result); -} - -Status WinEnv::FileExists(const std::string& fname) { - return winenv_io_.FileExists(fname); -} - -Status WinEnv::GetChildren(const std::string& dir, - std::vector* result) { - return winenv_io_.GetChildren(dir, result); -} - -Status WinEnv::CreateDir(const std::string& name) { - return winenv_io_.CreateDir(name); -} - -Status WinEnv::CreateDirIfMissing(const std::string& name) { - return winenv_io_.CreateDirIfMissing(name); -} - -Status WinEnv::DeleteDir(const std::string& name) { - return winenv_io_.DeleteDir(name); -} - -Status WinEnv::GetFileSize(const std::string& fname, - uint64_t* size) { - return winenv_io_.GetFileSize(fname, size); -} - -Status WinEnv::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { - return winenv_io_.GetFileModificationTime(fname, file_mtime); -} - -Status WinEnv::RenameFile(const std::string& src, - const std::string& target) { - return winenv_io_.RenameFile(src, target); -} - -Status WinEnv::LinkFile(const std::string& src, - const std::string& target) { - return winenv_io_.LinkFile(src, target); -} - -Status WinEnv::NumFileLinks(const std::string& fname, uint64_t* count) { - return winenv_io_.NumFileLinks(fname, count); -} - -Status WinEnv::AreFilesSame(const std::string& first, - const std::string& second, bool* res) { - return winenv_io_.AreFilesSame(first, second, res); -} - -Status WinEnv::LockFile(const std::string& lockFname, - FileLock** lock) { - return winenv_io_.LockFile(lockFname, lock); -} - -Status WinEnv::UnlockFile(FileLock* lock) { - return winenv_io_.UnlockFile(lock); -} - -Status WinEnv::GetTestDirectory(std::string* result) { - return winenv_io_.GetTestDirectory(result); -} - -Status WinEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { - return winenv_io_.NewLogger(fname, result); -} - -Status WinEnv::IsDirectory(const 
std::string& path, bool* is_dir) { - return winenv_io_.IsDirectory(path, is_dir); -} - -uint64_t WinEnv::NowMicros() { - return winenv_io_.NowMicros(); -} - -uint64_t WinEnv::NowNanos() { - return winenv_io_.NowNanos(); -} - Status WinEnv::GetHostName(char* name, uint64_t len) { return winenv_io_.GetHostName(name, len); } -Status WinEnv::GetAbsolutePath(const std::string& db_path, - std::string* output_path) { - return winenv_io_.GetAbsolutePath(db_path, output_path); -} - -std::string WinEnv::TimeToString(uint64_t secondsSince1970) { - return winenv_io_.TimeToString(secondsSince1970); -} - -void WinEnv::Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, - void(*unschedFunction)(void* arg)) { +void WinEnv::Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) { return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction); } @@ -1477,32 +1371,20 @@ int WinEnv::UnSchedule(void* arg, Env::Priority pri) { return winenv_threads_.UnSchedule(arg, pri); } -void WinEnv::StartThread(void(*function)(void* arg), void* arg) { +void WinEnv::StartThread(void (*function)(void* arg), void* arg) { return winenv_threads_.StartThread(function, arg); } -void WinEnv::WaitForJoin() { - return winenv_threads_.WaitForJoin(); -} +void WinEnv::WaitForJoin() { return winenv_threads_.WaitForJoin(); } -unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { +unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { return winenv_threads_.GetThreadPoolQueueLen(pri); } -uint64_t WinEnv::GetThreadID() const { - return winenv_threads_.GetThreadID(); -} - -Status WinEnv::GetFreeSpace(const std::string& path, uint64_t* diskfree) { - return winenv_io_.GetFreeSpace(path, diskfree); -} - -void WinEnv::SleepForMicroseconds(int micros) { - return winenv_threads_.SleepForMicroseconds(micros); -} +uint64_t WinEnv::GetThreadID() const { return winenv_threads_.GetThreadID(); } // Allow increasing the number of worker threads. 
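// [Sketch, not part of the patch] The pool-sizing entry points below keep
// their Env-level signatures. Typical usage sizes the low- and high-priority
// pools, which RocksDB commonly uses for compactions and flushes respectively:
//
//   Env* env = Env::Default();  // WinEnv on Windows builds
//   env->SetBackgroundThreads(4, Env::Priority::LOW);
//   env->SetBackgroundThreads(2, Env::Priority::HIGH);
//   env->IncBackgroundThreadsIfNeeded(6, Env::Priority::LOW);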
-void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { +void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { return winenv_threads_.SetBackgroundThreads(num, pri); } @@ -1510,25 +1392,10 @@ int WinEnv::GetBackgroundThreads(Env::Priority pri) { return winenv_threads_.GetBackgroundThreads(pri); } -void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { +void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri); } -EnvOptions WinEnv::OptimizeForManifestRead( - const EnvOptions& env_options) const { - return winenv_io_.OptimizeForManifestRead(env_options); -} - -EnvOptions WinEnv::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { - return winenv_io_.OptimizeForLogWrite(env_options, db_options); -} - -EnvOptions WinEnv::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - return winenv_io_.OptimizeForManifestWrite(env_options); -} - } // namespace port std::string Env::GenerateUniqueId() { @@ -1550,6 +1417,19 @@ std::string Env::GenerateUniqueId() { return result; } +std::shared_ptr FileSystem::Default() { + return port::WinFileSystem::Default(); +} + +const std::shared_ptr& SystemClock::Default() { + static std::shared_ptr clock = + std::make_shared(); + return clock; +} + +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs) { + return std::unique_ptr(new CompositeEnvWrapper(Env::Default(), fs)); +} } // namespace ROCKSDB_NAMESPACE #endif diff --git a/port/win/env_win.h b/port/win/env_win.h index 24e3a56fe36..54d3e7dbf0c 100644 --- a/port/win/env_win.h +++ b/port/win/env_win.h @@ -15,30 +15,30 @@ // multiple threads without any external synchronization. #pragma once - -#include "port/win/win_thread.h" -#include -#include "util/threadpool_imp.h" - #include #include #include -#include #include +#include +#include "env/composite_env_wrapper.h" +#include "port/win/win_thread.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "util/threadpool_imp.h" #undef GetCurrentTime #undef DeleteFile -#undef GetTickCount +#undef LoadLibrary namespace ROCKSDB_NAMESPACE { namespace port { // Currently not designed for inheritance but rather a replacement class WinEnvThreads { -public: - + public: explicit WinEnvThreads(Env* hosted_env); ~WinEnvThreads(); @@ -46,12 +46,12 @@ class WinEnvThreads { WinEnvThreads(const WinEnvThreads&) = delete; WinEnvThreads& operator=(const WinEnvThreads&) = delete; - void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)); + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)); int UnSchedule(void* arg, Env::Priority pri); - void StartThread(void(*function)(void* arg), void* arg); + void StartThread(void (*function)(void* arg), void* arg); void WaitForJoin(); @@ -61,255 +61,211 @@ class WinEnvThreads { uint64_t GetThreadID() const; - void SleepForMicroseconds(int micros); - // Allow increasing the number of worker threads. 
void SetBackgroundThreads(int num, Env::Priority pri); int GetBackgroundThreads(Env::Priority pri); void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri); -private: - + private: Env* hosted_env_; mutable std::mutex mu_; std::vector thread_pools_; std::vector threads_to_join_; - }; -// Designed for inheritance so can be re-used -// but certain parts replaced -class WinEnvIO { -public: - explicit WinEnvIO(Env* hosted_env); - - virtual ~WinEnvIO(); - - virtual Status DeleteFile(const std::string& fname); - - Status Truncate(const std::string& fname, size_t size); - - virtual Status GetCurrentTime(int64_t* unix_time); - - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - // Helper for NewWritable and ReopenWritableFile - virtual Status OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen); - - virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - // The returned file will only be accessed by one thread at a time. - virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - virtual Status NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result); - - virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result); - - virtual Status FileExists(const std::string& fname); - - virtual Status GetChildren(const std::string& dir, - std::vector* result); - - virtual Status CreateDir(const std::string& name); +class WinClock : public SystemClock { + public: + WinClock(); + virtual ~WinClock() {} - virtual Status CreateDirIfMissing(const std::string& name); + const char* Name() const override { return "WindowsClock"; } - virtual Status DeleteDir(const std::string& name); - - virtual Status GetFileSize(const std::string& fname, uint64_t* size); - - static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); - - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime); - - virtual Status RenameFile(const std::string& src, const std::string& target); - - virtual Status LinkFile(const std::string& src, const std::string& target); - - virtual Status NumFileLinks(const std::string& /*fname*/, - uint64_t* /*count*/); - - virtual Status AreFilesSame(const std::string& first, - const std::string& second, bool* res); - - virtual Status LockFile(const std::string& lockFname, FileLock** lock); - - virtual Status UnlockFile(FileLock* lock); - - virtual Status GetTestDirectory(std::string* result); - - virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result); - - virtual Status IsDirectory(const std::string& path, bool* is_dir); - - virtual uint64_t NowMicros(); - - virtual uint64_t NowNanos(); - - virtual Status GetHostName(char* name, uint64_t len); - - virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path); - - // This seems to clash with a macro on Windows, so #undef it here -#undef GetFreeSpace - - // Get the amount of free disk space - virtual Status GetFreeSpace(const std::string& path, uint64_t* diskfree); - - virtual std::string TimeToString(uint64_t secondsSince1970); - - virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const; - - virtual EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const; + uint64_t NowMicros() override; - virtual EnvOptions 
OptimizeForManifestRead( - const EnvOptions& env_options) const; + uint64_t NowNanos() override; - size_t GetPageSize() const { return page_size_; } + // 0 indicates not supported + uint64_t CPUMicros() override { return 0; } + void SleepForMicroseconds(int micros) override; - size_t GetAllocationGranularity() const { return allocation_granularity_; } + Status GetCurrentTime(int64_t* unix_time) override; + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time); uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } - static size_t GetSectorSize(const std::string& fname); - -private: - // Returns true iff the named directory exists and is a directory. - virtual bool DirExists(const std::string& dname); + private: + typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME); - typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); - - Env* hosted_env_; - size_t page_size_; - size_t allocation_granularity_; uint64_t perf_counter_frequency_; uint64_t nano_seconds_per_period_; FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; }; -class WinEnv : public Env { -public: - WinEnv(); - - ~WinEnv(); - - Status DeleteFile(const std::string& fname) override; - - Status Truncate(const std::string& fname, size_t size) override; - - Status GetCurrentTime(int64_t* unix_time) override; +class WinFileSystem : public FileSystem { + public: + static const std::shared_ptr& Default(); + WinFileSystem(const std::shared_ptr& clock); + ~WinFileSystem() {} + const char* Name() const { return "WinFS"; } + static size_t GetSectorSize(const std::string& fname); + size_t GetPageSize() const { return page_size_; } + size_t GetAllocationGranularity() const { return allocation_granularity_; } - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. - // - // The returned file will only be accessed by one thread at a time. - Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // The returned file will only be accessed by one thread at a time. - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewMemoryMappedFileBuffer( + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + // Truncate the named file to the specified size. 
+ IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override; + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override; - Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - Status FileExists(const std::string& fname) override; - - Status GetChildren(const std::string& dir, - std::vector* result) override; - - Status CreateDir(const std::string& name) override; - - Status CreateDirIfMissing(const std::string& name) override; - - Status DeleteDir(const std::string& name) override; - - Status GetFileSize(const std::string& fname, - uint64_t* size) override; - - Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override; - - Status RenameFile(const std::string& src, - const std::string& target) override; - - Status LinkFile(const std::string& src, - const std::string& target) override; - - Status NumFileLinks(const std::string& fname, uint64_t* count) override; + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& f, const IOOptions& io_opts, + IODebugContext* dbg) override; + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + // Delete the specified directory. + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + // Store the size of fname in *file_size. + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + // Store the last modification time of fname in *file_mtime. + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + // Rename file src to target. + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + + // Hard Link file src to target. 
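// [Sketch, not part of the patch] Per the env_win.cc implementation earlier in
// this patch, a hard link across volumes fails with NotSupported ("No cross FS
// links allowed"). Assuming a FileSystem handle `fs` and that both paths exist:
//
//   IOStatus s = fs->LinkFile("C:\\db\\000007.sst", "D:\\backup\\000007.sst",
//                             IOOptions(), /*dbg=*/nullptr);
//   // s.IsNotSupported() is true here because C: and D: are different volumes.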
+ IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NumFileLinks(const std::string& /*fname*/, + const IOOptions& /*options*/, uint64_t* /*count*/, + IODebugContext* /*dbg*/) override; + IOStatus AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, + const IOOptions& /*options*/, bool* /*res*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can overide to provide custom + // logger. + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) override; + // Get full directory name for this db. + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; + IOStatus IsDirectory(const std::string& /*path*/, const IOOptions& options, + bool* is_dir, IODebugContext* /*dgb*/) override; + // This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + IOStatus GetFreeSpace(const std::string& /*path*/, + const IOOptions& /*options*/, uint64_t* /*diskfree*/, + IODebugContext* /*dbg*/) override; + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override; + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override; + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override; + + protected: + static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); + // Returns true iff the named directory exists and is a directory. 
- Status AreFilesSame(const std::string& first, - const std::string& second, bool* res) override; + virtual bool DirExists(const std::string& dname); + // Helper for NewWritable and ReopenWritableFile + virtual IOStatus OpenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + bool reopen); - Status LockFile(const std::string& lockFname, FileLock** lock) override; + private: + std::shared_ptr clock_; + size_t page_size_; + size_t allocation_granularity_; +}; - Status UnlockFile(FileLock* lock) override; +// Designed for inheritance so can be re-used +// but certain parts replaced +class WinEnvIO { + public: + explicit WinEnvIO(Env* hosted_env); - Status GetTestDirectory(std::string* result) override; + virtual ~WinEnvIO(); - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; + virtual Status GetHostName(char* name, uint64_t len); - Status IsDirectory(const std::string& path, bool* is_dir) override; + private: + Env* hosted_env_; +}; - uint64_t NowMicros() override; +class WinEnv : public CompositeEnv { + public: + WinEnv(); - uint64_t NowNanos() override; + ~WinEnv(); Status GetHostName(char* name, uint64_t len) override; - Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override; - - std::string TimeToString(uint64_t secondsSince1970) override; - Status GetThreadList(std::vector* thread_list) override; - void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)) override; + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) override; int UnSchedule(void* arg, Env::Priority pri) override; - void StartThread(void(*function)(void* arg), void* arg) override; + void StartThread(void (*function)(void* arg), void* arg) override; void WaitForJoin() override; @@ -317,35 +273,16 @@ class WinEnv : public Env { uint64_t GetThreadID() const override; - // This seems to clash with a macro on Windows, so #undef it here -#undef GetFreeSpace - - // Get the amount of free disk space - Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override; - - void SleepForMicroseconds(int micros) override; - // Allow increasing the number of worker threads. 
void SetBackgroundThreads(int num, Env::Priority pri) override; int GetBackgroundThreads(Env::Priority pri) override; void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; - EnvOptions OptimizeForManifestRead( - const EnvOptions& env_options) const override; - - EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const override; - - EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const override; - - -private: - + private: WinEnvIO winenv_io_; WinEnvThreads winenv_threads_; }; -} // namespace port +} // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/win/io_win.cc b/port/win/io_win.cc index f8d1c3dbb83..1e662c06d91 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -20,36 +20,32 @@ namespace ROCKSDB_NAMESPACE { namespace port { /* -* DirectIOHelper -*/ + * DirectIOHelper + */ namespace { const size_t kSectorSize = 512; -inline -bool IsPowerOfTwo(const size_t alignment) { +inline bool IsPowerOfTwo(const size_t alignment) { return ((alignment) & (alignment - 1)) == 0; } -inline -bool IsSectorAligned(const size_t off) { +inline bool IsSectorAligned(const size_t off) { return (off & (kSectorSize - 1)) == 0; } -inline -bool IsAligned(size_t alignment, const void* ptr) { +inline bool IsAligned(size_t alignment, const void* ptr) { return ((uintptr_t(ptr)) & (alignment - 1)) == 0; } -} - +} // namespace std::string GetWindowsErrSz(DWORD err) { LPSTR lpMsgBuf; FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, - 0, // Default language - reinterpret_cast(&lpMsgBuf), 0, NULL); + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, + 0, // Default language + reinterpret_cast(&lpMsgBuf), 0, NULL); std::string Err = lpMsgBuf; LocalFree(lpMsgBuf); @@ -69,21 +65,20 @@ std::string GetWindowsErrSz(DWORD err) { // Because all the reads/writes happen by the specified offset, the caller in // theory should not // rely on the current file offset. 
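// [Sketch, not part of the patch] The helpers below do positioned I/O through
// OVERLAPPED, so the OS file pointer is neither consulted nor advanced. A
// rough call pattern, assuming an open WinFileData* `fd`:
//
//   size_t written = 0;
//   IOStatus ws = pwrite(fd, Slice("hello", 5), /*offset=*/4096, written);
//   char buf[5];
//   size_t read_bytes = 0;
//   IOStatus rs = pread(fd, buf, sizeof(buf), /*offset=*/4096, read_bytes);
//   // Both calls address byte offset 4096 regardless of earlier reads/writes.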
-Status pwrite(const WinFileData* file_data, const Slice& data, - uint64_t offset, size_t& bytes_written) { - - Status s; +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + IOStatus s; bytes_written = 0; size_t num_bytes = data.size(); if (num_bytes > std::numeric_limits::max()) { // May happen in 64-bit builds where size_t is 64-bits but // long is still 32-bit, but that's the API here at the moment - return Status::InvalidArgument("num_bytes is too large for a single write: " + - file_data->GetName()); + return IOStatus::InvalidArgument( + "num_bytes is too large for a single write: " + file_data->GetName()); } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {0}; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -92,11 +87,12 @@ Status pwrite(const WinFileData* file_data, const Slice& data, DWORD bytesWritten = 0; - if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast(num_bytes), - &bytesWritten, &overlapped)) { + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), + static_cast(num_bytes), &bytesWritten, + &overlapped)) { auto lastError = GetLastError(); s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), - lastError); + lastError); } else { bytes_written = bytesWritten; } @@ -105,18 +101,17 @@ Status pwrite(const WinFileData* file_data, const Slice& data, } // See comments for pwrite above -Status pread(const WinFileData* file_data, char* src, size_t num_bytes, - uint64_t offset, size_t& bytes_read) { - - Status s; +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + IOStatus s; bytes_read = 0; if (num_bytes > std::numeric_limits::max()) { - return Status::InvalidArgument("num_bytes is too large for a single read: " + - file_data->GetName()); + return IOStatus::InvalidArgument( + "num_bytes is too large for a single read: " + file_data->GetName()); } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {0}; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -125,13 +120,14 @@ Status pread(const WinFileData* file_data, char* src, size_t num_bytes, DWORD bytesRead = 0; - if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast(num_bytes), - &bytesRead, &overlapped)) { + if (FALSE == ReadFile(file_data->GetFileHandle(), src, + static_cast(num_bytes), &bytesRead, + &overlapped)) { auto lastError = GetLastError(); // EOF is OK with zero bytes read if (lastError != ERROR_HANDLE_EOF) { s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(), - lastError); + lastError); } } else { bytes_read = bytesRead; @@ -143,35 +139,34 @@ Status pread(const WinFileData* file_data, char* src, size_t num_bytes, // SetFileInformationByHandle() is capable of fast pre-allocates. // However, this does not change the file end position unless the file is // truncated and the pre-allocated space is not considered filled with zeros. 
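// [Sketch, not part of the patch] The distinction described above, using the
// two helpers defined below on an open handle `hFile` for file `fname`:
//
//   fallocate(fname, hFile, 1024 * 1024);  // reserve 1 MiB of disk space;
//                                          // logical file size is unchanged
//   ftruncate(fname, hFile, 1024 * 1024);  // move end-of-file to 1 MiB so the
//                                          // reserved space becomes usable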
-Status fallocate(const std::string& filename, HANDLE hFile, - uint64_t to_size) { - Status status; +IOStatus fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + IOStatus status; FILE_ALLOCATION_INFO alloc_info; alloc_info.AllocationSize.QuadPart = to_size; if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, - sizeof(FILE_ALLOCATION_INFO))) { + sizeof(FILE_ALLOCATION_INFO))) { auto lastError = GetLastError(); status = IOErrorFromWindowsError( - "Failed to pre-allocate space: " + filename, lastError); + "Failed to pre-allocate space: " + filename, lastError); } return status; } -Status ftruncate(const std::string& filename, HANDLE hFile, - uint64_t toSize) { - Status status; +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) { + IOStatus status; FILE_END_OF_FILE_INFO end_of_file; end_of_file.EndOfFile.QuadPart = toSize; if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, - sizeof(FILE_END_OF_FILE_INFO))) { + sizeof(FILE_END_OF_FILE_INFO))) { auto lastError = GetLastError(); status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, - lastError); + lastError); } return status; @@ -212,9 +207,11 @@ WinMmapReadableFile::~WinMmapReadableFile() { assert(ret); } -Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; +IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + IOStatus s; if (offset > length_) { *result = Slice(); @@ -222,13 +219,12 @@ Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, } else if (offset + n > length_) { n = length_ - static_cast(offset); } - *result = - Slice(reinterpret_cast(mapped_region_)+offset, n); + *result = Slice(reinterpret_cast(mapped_region_) + offset, n); return s; } -Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { @@ -238,20 +234,19 @@ size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { /////////////////////////////////////////////////////////////////////////////// /// WinMmapFile - // Can only truncate or reserve to a sector size aligned if // used on files that are opened with Unbuffered I/O -Status WinMmapFile::TruncateFile(uint64_t toSize) { +IOStatus WinMmapFile::TruncateFile(uint64_t toSize) { return ftruncate(filename_, hFile_, toSize); } -Status WinMmapFile::UnmapCurrentRegion() { - Status status; +IOStatus WinMmapFile::UnmapCurrentRegion() { + IOStatus status; if (mapped_begin_ != nullptr) { if (!::UnmapViewOfFile(mapped_begin_)) { status = IOErrorFromWindowsError( - "Failed to unmap file view: " + filename_, GetLastError()); + "Failed to unmap file view: " + filename_, GetLastError()); } // Move on to the next portion of the file @@ -271,16 +266,16 @@ Status WinMmapFile::UnmapCurrentRegion() { return status; } -Status WinMmapFile::MapNewRegion() { - - Status status; +IOStatus WinMmapFile::MapNewRegion(const IOOptions& options, + IODebugContext* dbg) { + IOStatus status; assert(mapped_begin_ == nullptr); size_t minDiskSize = static_cast(file_offset_) + view_size_; if (minDiskSize > reserved_size_) { - status = Allocate(file_offset_, view_size_); + status = Allocate(file_offset_, view_size_, options, 
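// [Sketch, not part of the patch] Roundup() below rounds the 32 KiB request up
// to the allocation granularity, which is commonly 64 KiB on Windows:
//
//   Roundup(32 * 1024, 64 * 1024) == 64 * 1024;  // view_size_ becomes 64 KiB
//   Roundup(32 * 1024, 4 * 1024)  == 32 * 1024;  // already aligned: unchanged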
dbg); if (!status.ok()) { return status; } @@ -288,7 +283,6 @@ Status WinMmapFile::MapNewRegion() { // Need to remap if (hMap_ == NULL || reserved_size_ > mapping_size_) { - if (hMap_ != NULL) { // Unmap the previous one BOOL ret __attribute__((__unused__)); @@ -301,18 +295,18 @@ Status WinMmapFile::MapNewRegion() { mappingSize.QuadPart = reserved_size_; hMap_ = CreateFileMappingA( - hFile_, - NULL, // Security attributes - PAGE_READWRITE, // There is not a write only mode for mapping - mappingSize.HighPart, // Enable mapping the whole file but the actual - // amount mapped is determined by MapViewOfFile - mappingSize.LowPart, - NULL); // Mapping name + hFile_, + NULL, // Security attributes + PAGE_READWRITE, // There is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name if (NULL == hMap_) { return IOErrorFromWindowsError( - "WindowsMmapFile failed to create file mapping for: " + filename_, - GetLastError()); + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); } mapping_size_ = reserved_size_; @@ -323,13 +317,13 @@ Status WinMmapFile::MapNewRegion() { // View must begin at the granularity aligned offset mapped_begin_ = reinterpret_cast( - MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, - view_size_, NULL)); + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); if (!mapped_begin_) { status = IOErrorFromWindowsError( - "WindowsMmapFile failed to map file view: " + filename_, - GetLastError()); + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); } else { mapped_end_ = mapped_begin_ + view_size_; dst_ = mapped_begin_; @@ -339,15 +333,15 @@ Status WinMmapFile::MapNewRegion() { return status; } -Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { +IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { return fallocate(filename_, hFile_, spaceToReserve); } WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, size_t allocation_granularity, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, false), - WritableFile(options), + FSWritableFile(options), hMap_(NULL), page_size_(page_size), allocation_granularity_(allocation_granularity), @@ -373,17 +367,19 @@ WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, // View size must be both the multiple of allocation_granularity AND the // page size and the granularity is usually a multiple of a page size. 
- const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + const size_t viewSize = + 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode view_size_ = Roundup(viewSize, allocation_granularity_); } WinMmapFile::~WinMmapFile() { if (hFile_) { - this->Close(); + this->Close(IOOptions(), nullptr); } } -Status WinMmapFile::Append(const Slice& data) { +IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) { const char* src = data.data(); size_t left = data.size(); @@ -392,9 +388,9 @@ Status WinMmapFile::Append(const Slice& data) { size_t avail = mapped_end_ - dst_; if (avail == 0) { - Status s = UnmapCurrentRegion(); + IOStatus s = UnmapCurrentRegion(); if (s.ok()) { - s = MapNewRegion(); + s = MapNewRegion(options, dbg); } if (!s.ok()) { @@ -416,30 +412,31 @@ Status WinMmapFile::Append(const Slice& data) { memset(dst_, 0, bytesToPad); } - return Status::OK(); + return IOStatus::OK(); } // Means Close() will properly take care of truncate // and it does not need any additional information -Status WinMmapFile::Truncate(uint64_t size) { - return Status::OK(); +IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinMmapFile::Close() { - Status s; +IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) { + IOStatus s; assert(NULL != hFile_); // We truncate to the precise size so no // uninitialized data at the end. SetEndOfFile // which we use does not write zeros and it is good. - uint64_t targetSize = GetFileSize(); + uint64_t targetSize = GetFileSize(options, dbg); if (mapped_begin_ != nullptr) { // Sync before unmapping to make sure everything // is on disk and there is not a lazy writing // so we are deterministic with the tests - Sync(); + Sync(options, dbg); s = UnmapCurrentRegion(); } @@ -448,14 +445,13 @@ Status WinMmapFile::Close() { if (!ret && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to Close mapping for file: " + filename_, lastError); + "Failed to Close mapping for file: " + filename_, lastError); } hMap_ = NULL; } if (hFile_ != NULL) { - TruncateFile(targetSize); BOOL ret = ::CloseHandle(hFile_); @@ -464,18 +460,22 @@ Status WinMmapFile::Close() { if (!ret && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to close file map handle: " + filename_, lastError); + "Failed to close file map handle: " + filename_, lastError); } } return s; } -Status WinMmapFile::Flush() { return Status::OK(); } +IOStatus WinMmapFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} // Flush only data -Status WinMmapFile::Sync() { - Status s; +IOStatus WinMmapFile::Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; // Some writes occurred since last sync if (dst_ > last_sync_) { @@ -485,15 +485,15 @@ Status WinMmapFile::Sync() { assert(dst_ < mapped_end_); size_t page_begin = - TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); + TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); size_t page_end = - TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); + TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); // Flush only the amount of that is a multiple of pages if (!::FlushViewOfFile(mapped_begin_ + page_begin, - (page_end - page_begin) + page_size_)) { + (page_end - page_begin) + page_size_)) { s = 
IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, - GetLastError()); + GetLastError()); } else { last_sync_ = dst_; } @@ -503,16 +503,16 @@ Status WinMmapFile::Sync() { } /** -* Flush data as well as metadata to stable storage. -*/ -Status WinMmapFile::Fsync() { - Status s = Sync(); + * Flush data as well as metadata to stable storage. + */ +IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + IOStatus s = Sync(options, dbg); // Flush metadata if (s.ok() && pending_sync_) { if (!::FlushFileBuffers(hFile_)) { s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, - GetLastError()); + GetLastError()); } pending_sync_ = false; } @@ -521,27 +521,31 @@ Status WinMmapFile::Fsync() { } /** -* Get the size of valid data in the file. This will not match the -* size that is returned from the filesystem because we use mmap -* to extend file by map_size every time. -*/ -uint64_t WinMmapFile::GetFileSize() { + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ +uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { size_t used = dst_ - mapped_begin_; return file_offset_ + used; } -Status WinMmapFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } -Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) { - Status status; - TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds); +IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus status; + TEST_KILL_RANDOM("WinMmapFile::Allocate"); // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(static_cast(offset + len), view_size_); + size_t spaceToReserve = + Roundup(static_cast(offset + len), view_size_); // Nothing to do if (spaceToReserve <= reserved_size_) { return status; @@ -563,31 +567,34 @@ size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const { // WinSequentialFile WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, f, options.use_direct_reads) {} WinSequentialFile::~WinSequentialFile() { assert(hFile_ != INVALID_HANDLE_VALUE); } -Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { - Status s; +IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + IOStatus s; size_t r = 0; assert(result != nullptr); if (WinFileData::use_direct_io()) { - return Status::NotSupported("Read() does not support direct_io"); + return IOStatus::NotSupported("Read() does not support direct_io"); } // Windows ReadFile API accepts a DWORD. // While it is possible to read in a loop if n is too big // it is an unlikely case. 
if (n > std::numeric_limits::max()) { - return Status::InvalidArgument("n is too big for a single ReadFile: " - + filename_); + return IOStatus::InvalidArgument("n is too big for a single ReadFile: " + + filename_); } - DWORD bytesToRead = static_cast(n); //cast is safe due to the check above + DWORD bytesToRead = + static_cast(n); // cast is safe due to the check above DWORD bytesRead = 0; BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); if (ret != FALSE) { @@ -595,8 +602,7 @@ Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { } else { auto lastError = GetLastError(); if (lastError != ERROR_HANDLE_EOF) { - s = IOErrorFromWindowsError("ReadFile failed: " + filename_, - lastError); + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError); } } @@ -604,99 +610,91 @@ Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { return s; } -Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const { +IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const { return pread(this, src, numBytes, offset, bytes_read); } -Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) { - - Status s; - +IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { if (!WinFileData::use_direct_io()) { - return Status::NotSupported("This function is only used for direct_io"); + return IOStatus::NotSupported("This function is only used for direct_io"); } - if (!IsSectorAligned(static_cast(offset)) || - !IsSectorAligned(n)) { - return Status::InvalidArgument( + if (!IsSectorAligned(static_cast(offset)) || !IsSectorAligned(n)) { + return IOStatus::InvalidArgument( "WinSequentialFile::PositionedRead: offset is not properly aligned"); } - size_t bytes_read = 0; // out param - s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); + size_t bytes_read = 0; // out param + IOStatus s = PositionedReadInternal(scratch, static_cast(n), offset, + bytes_read); *result = Slice(scratch, bytes_read); return s; } - -Status WinSequentialFile::Skip(uint64_t n) { - // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit - // integer. As such it is a highly unlikley case to have n so large. +IOStatus WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed + // 64-bit integer. As such it is a highly unlikley case to have n so large. 
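// The direct I/O path in PositionedRead() above rejects offsets and sizes
// that are not sector aligned, and ReadImpl() further down also checks
// buffer alignment. A minimal sketch of such predicates, assuming a
// non-zero unit; these are illustrative helpers, not RocksDB's
// IsSectorAligned()/IsAligned():
#include <cstddef>
#include <cstdint>
inline bool IsMultipleOfSketch(uint64_t value, uint64_t unit) {
  return unit != 0 && value % unit == 0;  // e.g. offset % 512 == 0
}
inline bool IsPointerAlignedSketch(const void* p, size_t alignment) {
  return alignment != 0 && reinterpret_cast<uintptr_t>(p) % alignment == 0;
}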
if (n > static_cast(std::numeric_limits::max())) { - return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" + - filename_); + return IOStatus::InvalidArgument( + "n is too large for a single SetFilePointerEx() call" + filename_); } LARGE_INTEGER li; - li.QuadPart = static_cast(n); //cast is safe due to the check above + li.QuadPart = static_cast(n); // cast is safe due to the check + // above BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); if (ret == FALSE) { auto lastError = GetLastError(); return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, lastError); } - return Status::OK(); + return IOStatus::OK(); } -Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } ////////////////////////////////////////////////////////////////////////////////////////////////// /// WinRandomAccessBase -inline -Status WinRandomAccessImpl::PositionedReadInternal(char* src, - size_t numBytes, - uint64_t offset, - size_t& bytes_read) const { +inline IOStatus WinRandomAccessImpl::PositionedReadInternal( + char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const { return pread(file_base_, src, numBytes, offset, bytes_read); } -inline -WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, - size_t alignment, - const EnvOptions& options) : - file_base_(file_base), - alignment_(alignment) { - +inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, + size_t alignment, + const FileOptions& options) + : file_base_(file_base), alignment_(alignment) { assert(!options.use_mmap_reads); } -inline -Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - - Status s; - +inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, + Slice* result, + char* scratch) const { // Check buffer alignment if (file_base_->use_direct_io()) { if (!IsSectorAligned(static_cast(offset)) || !IsAligned(alignment_, scratch)) { - return Status::InvalidArgument( - "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); + return IOStatus::InvalidArgument( + "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly " + "aligned"); } } if (n == 0) { *result = Slice(scratch, 0); - return s; + return IOStatus::OK(); } size_t bytes_read = 0; - s = PositionedReadInternal(scratch, n, offset, bytes_read); + IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read); *result = Slice(scratch, bytes_read); return s; } @@ -706,20 +704,21 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_reads), WinRandomAccessImpl(this, alignment, options) {} -WinRandomAccessFile::~WinRandomAccessFile() { -} +WinRandomAccessFile::~WinRandomAccessFile() {} -Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { return ReadImpl(offset, n, result, scratch); } -Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus 
WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { @@ -734,27 +733,26 @@ size_t WinRandomAccessFile::GetRequiredBufferAlignment() const { // WinWritableImpl // -inline -Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { - return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve); +inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), + spaceToReserve); } -inline -WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) - : file_data_(file_data), - alignment_(alignment), - next_write_offset_(0), - reservedsize_(0) { - +inline WinWritableImpl::WinWritableImpl(WinFileData* file_data, + size_t alignment) + : file_data_(file_data), + alignment_(alignment), + next_write_offset_(0), + reservedsize_(0) { // Query current position in case ReopenWritableFile is called // This position is only important for buffered writes // for unbuffered writes we explicitely specify the position. LARGE_INTEGER zero_move; - zero_move.QuadPart = 0; // Do not move + zero_move.QuadPart = 0; // Do not move LARGE_INTEGER pos; pos.QuadPart = 0; BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, - FILE_CURRENT); + FILE_CURRENT); // Querying no supped to fail if (ret != 0) { next_write_offset_ = pos.QuadPart; @@ -763,17 +761,15 @@ WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) } } -inline -Status WinWritableImpl::AppendImpl(const Slice& data) { - - Status s; +inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) { + IOStatus s; if (data.size() > std::numeric_limits::max()) { - return Status::InvalidArgument("data is too long for a single write" + - file_data_->GetName()); + return IOStatus::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); } - size_t bytes_written = 0; // out param + size_t bytes_written = 0; // out param if (file_data_->use_direct_io()) { // With no offset specified we are appending @@ -781,56 +777,53 @@ Status WinWritableImpl::AppendImpl(const Slice& data) { assert(IsSectorAligned(next_write_offset_)); if (!IsSectorAligned(data.size()) || !IsAligned(static_cast(GetAlignement()), data.data())) { - s = Status::InvalidArgument( - "WriteData must be page aligned, size must be sector aligned"); + s = IOStatus::InvalidArgument( + "WriteData must be page aligned, size must be sector aligned"); } else { s = pwrite(file_data_, data, next_write_offset_, bytes_written); } } else { - DWORD bytesWritten = 0; if (!WriteFile(file_data_->GetFileHandle(), data.data(), - static_cast(data.size()), &bytesWritten, NULL)) { + static_cast(data.size()), &bytesWritten, NULL)) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to WriteFile: " + file_data_->GetName(), - lastError); + "Failed to WriteFile: " + file_data_->GetName(), lastError); } else { bytes_written = bytesWritten; } } - if(s.ok()) { + if (s.ok()) { if (bytes_written == data.size()) { // This matters for direct_io cases where // we rely on the fact that next_write_offset_ // is sector aligned next_write_offset_ += bytes_written; } else { - s = Status::IOError("Failed to write all bytes: " + - file_data_->GetName()); + s = IOStatus::IOError("Failed to write all bytes: " + + file_data_->GetName()); } } return s; } -inline -Status WinWritableImpl::PositionedAppendImpl(const 
Slice& data, uint64_t offset) { - - if(file_data_->use_direct_io()) { +inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data, + uint64_t offset) { + if (file_data_->use_direct_io()) { if (!IsSectorAligned(static_cast(offset)) || !IsSectorAligned(data.size()) || !IsAligned(static_cast(GetAlignement()), data.data())) { - return Status::InvalidArgument( - "Data and offset must be page aligned, size must be sector aligned"); + return IOStatus::InvalidArgument( + "Data and offset must be page aligned, size must be sector aligned"); } } size_t bytes_written = 0; - Status s = pwrite(file_data_, data, offset, bytes_written); + IOStatus s = pwrite(file_data_, data, offset, bytes_written); - if(s.ok()) { + if (s.ok()) { if (bytes_written == data.size()) { // For sequential write this would be simple // size extension by data.size() @@ -839,23 +832,21 @@ Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) next_write_offset_ = write_end; } } else { - s = Status::IOError("Failed to write all of the requested data: " + - file_data_->GetName()); + s = IOStatus::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); } } return s; } -inline -Status WinWritableImpl::TruncateImpl(uint64_t size) { - +inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) { // It is tempting to check for the size for sector alignment // but truncation may come at the end and there is not a requirement // for this to be sector aligned so long as we do not attempt to write // after that. The interface docs state that the behavior is undefined // in that case. - Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), - size); + IOStatus s = + ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size); if (s.ok()) { next_write_offset_ = size; @@ -863,50 +854,48 @@ Status WinWritableImpl::TruncateImpl(uint64_t size) { return s; } -inline -Status WinWritableImpl::CloseImpl() { - - Status s; +inline IOStatus WinWritableImpl::CloseImpl() { + IOStatus s; auto hFile = file_data_->GetFileHandle(); assert(INVALID_HANDLE_VALUE != hFile); if (!::FlushFileBuffers(hFile)) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " + - file_data_->GetName(), - lastError); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Close() for: " + file_data_->GetName(), + lastError); } - if(!file_data_->CloseFile() && s.ok()) { + if (!file_data_->CloseFile() && s.ok()) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(), - lastError); + s = IOErrorFromWindowsError( + "CloseHandle failed for: " + file_data_->GetName(), lastError); } return s; } -inline -Status WinWritableImpl::SyncImpl() { - Status s; - if (!::FlushFileBuffers (file_data_->GetFileHandle())) { +inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + if (!::FlushFileBuffers(file_data_->GetFileHandle())) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError); + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), + lastError); } return s; } - -inline -Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { - Status status; - TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); +inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { + IOStatus status; + 
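// Note: the reservation below rounds (offset + len) up to the write
// alignment before preallocating; for example offset + len = 100000 with
// alignment_ = 4096 gives spaceToReserve = 25 * 4096 = 102400, and the
// fallocate call is skipped when reservedsize_ already covers that amount.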
TEST_KILL_RANDOM("WinWritableFile::Allocate"); // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); + size_t spaceToReserve = Roundup(static_cast(offset + len), + static_cast(alignment_)); // Nothing to do if (spaceToReserve <= reservedsize_) { return status; @@ -920,66 +909,78 @@ Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { return status; } - //////////////////////////////////////////////////////////////////////////////// /// WinWritableFile WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, size_t /* capacity */, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_writes), WinWritableImpl(this, alignment), - WritableFile(options) { + FSWritableFile(options) { assert(!options.use_mmap_writes); } -WinWritableFile::~WinWritableFile() { -} +WinWritableFile::~WinWritableFile() {} // Indicates if the class makes use of direct I/O -bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } +bool WinWritableFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} size_t WinWritableFile::GetRequiredBufferAlignment() const { return static_cast(GetAlignement()); } -Status WinWritableFile::Append(const Slice& data) { +IOStatus WinWritableFile::Append(const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return AppendImpl(data); } -Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { +IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return PositionedAppendImpl(data, offset); } // Need to implement this so the file is truncated correctly // when buffered and unbuffered mode -Status WinWritableFile::Truncate(uint64_t size) { +IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return TruncateImpl(size); } -Status WinWritableFile::Close() { +IOStatus WinWritableFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return CloseImpl(); } - // write out the cached data to the OS cache - // This is now taken care of the WritableFileWriter -Status WinWritableFile::Flush() { - return Status::OK(); +// write out the cached data to the OS cache +// This is now taken care of the WritableFileWriter +IOStatus WinWritableFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinWritableFile::Sync() { - return SyncImpl(); +IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); } -Status WinWritableFile::Fsync() { return SyncImpl(); } +IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} bool WinWritableFile::IsSyncThreadSafe() const { return true; } -uint64_t WinWritableFile::GetFileSize() { +uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return GetFileNextWriteOffset(); } -Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) { +IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return AllocateImpl(offset, len); } @@ -991,36 +992,43 @@ size_t 
WinWritableFile::GetUniqueId(char* id, size_t max_size) const { /// WinRandomRWFile WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, - size_t alignment, const EnvOptions& options) + size_t alignment, const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_reads && options.use_direct_writes), WinRandomAccessImpl(this, alignment, options), WinWritableImpl(this, alignment) {} -bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } +bool WinRandomRWFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} size_t WinRandomRWFile::GetRequiredBufferAlignment() const { return static_cast(GetAlignement()); } -Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { +IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return PositionedAppendImpl(data, offset); } -Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) const { return ReadImpl(offset, n, result, scratch); } -Status WinRandomRWFile::Flush() { - return Status::OK(); +IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinRandomRWFile::Sync() { - return SyncImpl(); +IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); } -Status WinRandomRWFile::Close() { +IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return CloseImpl(); } @@ -1029,9 +1037,9 @@ Status WinRandomRWFile::Close() { WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { BOOL ret #if defined(_MSC_VER) - = FALSE; + = FALSE; #else - __attribute__((__unused__)); + __attribute__((__unused__)); #endif if (base_ != nullptr) { ret = ::UnmapViewOfFile(base_); @@ -1053,7 +1061,10 @@ WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { ////////////////////////////////////////////////////////////////////////// /// WinDirectory -Status WinDirectory::Fsync() { return Status::OK(); } +IOStatus WinDirectory::Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const { return GetUniqueIdFromFile(handle_, id, max_size); @@ -1067,7 +1078,7 @@ WinFileLock::~WinFileLock() { assert(ret); } -} +} // namespace port } // namespace ROCKSDB_NAMESPACE #endif diff --git a/port/win/io_win.h b/port/win/io_win.h index d7aa7b48397..4119f5add2c 100644 --- a/port/win/io_win.h +++ b/port/win/io_win.h @@ -9,51 +9,53 @@ #pragma once #include +#include + #include #include +#include "rocksdb/file_system.h" #include "rocksdb/status.h" -#include "rocksdb/env.h" #include "util/aligned_buffer.h" - -#include +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { namespace port { std::string GetWindowsErrSz(DWORD err); -inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { +inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) { return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) - ? Status::NoSpace(context, GetWindowsErrSz(err)) + ? IOStatus::NoSpace(context, GetWindowsErrSz(err)) : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) - ? 
Status::PathNotFound(context, GetWindowsErrSz(err)) - : Status::IOError(context, GetWindowsErrSz(err)); + ? IOStatus::PathNotFound(context, GetWindowsErrSz(err)) + : IOStatus::IOError(context, GetWindowsErrSz(err)); } -inline Status IOErrorFromLastWindowsError(const std::string& context) { +inline IOStatus IOErrorFromLastWindowsError(const std::string& context) { return IOErrorFromWindowsError(context, GetLastError()); } -inline Status IOError(const std::string& context, int err_number) { +inline IOStatus IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) - ? Status::NoSpace(context, strerror(err_number)) + ? IOStatus::NoSpace(context, errnoStr(err_number).c_str()) : (err_number == ENOENT) - ? Status::PathNotFound(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + ? IOStatus::PathNotFound(context, + errnoStr(err_number).c_str()) + : IOStatus::IOError(context, errnoStr(err_number).c_str()); } class WinFileData; -Status pwrite(const WinFileData* file_data, const Slice& data, - uint64_t offset, size_t& bytes_written); +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written); -Status pread(const WinFileData* file_data, char* src, size_t num_bytes, - uint64_t offset, size_t& bytes_read); +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read); -Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); +IOStatus fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); -Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); @@ -95,34 +97,38 @@ class WinFileData { WinFileData& operator=(const WinFileData&) = delete; }; -class WinSequentialFile : protected WinFileData, public SequentialFile { - +class WinSequentialFile : protected WinFileData, public FSSequentialFile { // Override for behavior change when creating a custom env - virtual Status PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const; + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; -public: + public: WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options); + const FileOptions& options); ~WinSequentialFile(); WinSequentialFile(const WinSequentialFile&) = delete; WinSequentialFile& operator=(const WinSequentialFile&) = delete; - virtual Status Read(size_t n, Slice* result, char* scratch) override; - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override; + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; - virtual Status Skip(uint64_t n) override; + IOStatus Skip(uint64_t n) override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } }; // mmap() based random-access -class WinMmapReadableFile : private WinFileData, 
public RandomAccessFile { +class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile { HANDLE hMap_; const void* mapped_region_; @@ -138,10 +144,11 @@ class WinMmapReadableFile : private WinFileData, public RandomAccessFile { WinMmapReadableFile(const WinMmapReadableFile&) = delete; WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -150,7 +157,7 @@ class WinMmapReadableFile : private WinFileData, public RandomAccessFile { // data to the file. This is safe since we either properly close the // file before reading from it, or for log files, the reading code // knows enough to skip zero suffixes. -class WinMmapFile : private WinFileData, public WritableFile { +class WinMmapFile : private WinFileData, public FSWritableFile { private: HANDLE hMap_; @@ -179,51 +186,59 @@ class WinMmapFile : private WinFileData, public WritableFile { // Can only truncate or reserve to a sector size aligned if // used on files that are opened with Unbuffered I/O - Status TruncateFile(uint64_t toSize); + IOStatus TruncateFile(uint64_t toSize); - Status UnmapCurrentRegion(); + IOStatus UnmapCurrentRegion(); - Status MapNewRegion(); + IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg); - virtual Status PreallocateInternal(uint64_t spaceToReserve); + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); public: WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, - size_t allocation_granularity, const EnvOptions& options); + size_t allocation_granularity, const FileOptions& options); ~WinMmapFile(); WinMmapFile(const WinMmapFile&) = delete; WinMmapFile& operator=(const WinMmapFile&) = delete; - virtual Status Append(const Slice& data) override; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } // Means Close() will properly take care of truncate // and it does not need any additional information - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; // Flush only data - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; /** - * Flush data as well as metadata to stable storage. - */ - virtual Status Fsync() override; + * Flush data as well as metadata to stable storage. + */ + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; /** - * Get the size of valid data in the file. This will not match the - * size that is returned from the filesystem because we use mmap - * to extend file by map_size every time. 
- */ - virtual uint64_t GetFileSize() override; + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -231,24 +246,24 @@ class WinMmapFile : private WinFileData, public WritableFile { class WinRandomAccessImpl { protected: WinFileData* file_base_; - size_t alignment_; + size_t alignment_; // Override for behavior change when creating a custom env - virtual Status PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const; + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; WinRandomAccessImpl(WinFileData* file_base, size_t alignment, - const EnvOptions& options); + const FileOptions& options); virtual ~WinRandomAccessImpl() {} - Status ReadImpl(uint64_t offset, size_t n, Slice* result, - char* scratch) const; + IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const; size_t GetAlignment() const { return alignment_; } public: - WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; }; @@ -258,21 +273,24 @@ class WinRandomAccessFile : private WinFileData, protected WinRandomAccessImpl, // Want to be able to override // PositionedReadInternal - public RandomAccessFile { + public FSRandomAccessFile { public: WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options); + const FileOptions& options); ~WinRandomAccessFile(); - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; - virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; virtual size_t GetRequiredBufferAlignment() const override; }; @@ -293,10 +311,11 @@ class WinWritableImpl { protected: WinFileData* file_data_; const uint64_t alignment_; - uint64_t next_write_offset_; // Needed because Windows does not support O_APPEND + uint64_t + next_write_offset_; // Needed because Windows does not support O_APPEND uint64_t reservedsize_; // how far we have reserved space - virtual Status PreallocateInternal(uint64_t spaceToReserve); + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); WinWritableImpl(WinFileData* file_data, size_t alignment); @@ -304,17 +323,17 @@ class WinWritableImpl { uint64_t GetAlignement() const { return alignment_; } - Status AppendImpl(const Slice& data); + IOStatus AppendImpl(const Slice& data); // Requires that the data is aligned as specified by // 
GetRequiredBufferAlignment() - Status PositionedAppendImpl(const Slice& data, uint64_t offset); + IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset); - Status TruncateImpl(uint64_t size); + IOStatus TruncateImpl(uint64_t size); - Status CloseImpl(); + IOStatus CloseImpl(); - Status SyncImpl(); + IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg); uint64_t GetFileNextWriteOffset() { // Double accounting now here with WritableFileWriter @@ -326,7 +345,7 @@ class WinWritableImpl { return next_write_offset_; } - Status AllocateImpl(uint64_t offset, uint64_t len); + IOStatus AllocateImpl(uint64_t offset, uint64_t len); public: WinWritableImpl(const WinWritableImpl&) = delete; @@ -335,32 +354,47 @@ class WinWritableImpl { class WinWritableFile : private WinFileData, protected WinWritableImpl, - public WritableFile { + public FSWritableFile { public: WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, - size_t capacity, const EnvOptions& options); + size_t capacity, const FileOptions& options); ~WinWritableFile(); - virtual Status Append(const Slice& data) override; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } // Requires that the data is aligned as specified by // GetRequiredBufferAlignment() - virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } // Need to implement this so the file is truncated correctly // when buffered and unbuffered mode - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; // write out the cached data to the OS cache // This is now taken care of the WritableFileWriter - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() override; + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; virtual bool IsSyncThreadSafe() const override; @@ -370,9 +404,10 @@ class WinWritableFile : private WinFileData, virtual size_t GetRequiredBufferAlignment() const override; - virtual uint64_t GetFileSize() override; + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -380,10 +415,10 @@ class WinWritableFile : private WinFileData, class WinRandomRWFile : private WinFileData, protected WinRandomAccessImpl, protected WinWritableImpl, - public RandomRWFile { + public FSRandomRWFile { public: WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t 
alignment, - const EnvOptions& options); + const FileOptions& options); ~WinRandomRWFile() {} @@ -397,45 +432,50 @@ class WinRandomRWFile : private WinFileData, // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - virtual Status Write(uint64_t offset, const Slice& data) override; + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() override { return Sync(); } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return Sync(options, dbg); + } - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { -private: - HANDLE file_handle_; - HANDLE map_handle_; -public: - WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, size_t size) : - MemoryMappedFileBuffer(base, size), - file_handle_(file_handle), - map_handle_(map_handle) {} + private: + HANDLE file_handle_; + HANDLE map_handle_; + + public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, + size_t size) + : MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} ~WinMemoryMappedBuffer() override; }; -class WinDirectory : public Directory { +class WinDirectory : public FSDirectory { HANDLE handle_; + public: explicit WinDirectory(HANDLE h) noexcept : handle_(h) { assert(handle_ != INVALID_HANDLE_VALUE); } - ~WinDirectory() { - ::CloseHandle(handle_); - } - virtual Status Fsync() override; + ~WinDirectory() { ::CloseHandle(handle_); } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -452,5 +492,5 @@ class WinFileLock : public FileLock { private: HANDLE hFile_; }; -} +} // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/win/port_win.h b/port/win/port_win.h index 2c5b8ff0533..a6a6de27801 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -355,6 +355,7 @@ extern void SetCpuPriority(ThreadId id, CpuPriority priority); #define RX_FILESTRING std::wstring #define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a) #define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a) +#define RX_FNCMP(a, b) ::wcscmp(a, RX_FN(b).c_str()) #define RX_FNLEN(a) ::wcslen(a) #define RX_DeleteFile DeleteFileW @@ -379,6 +380,7 @@ extern void SetCpuPriority(ThreadId id, CpuPriority priority); #define RX_FILESTRING std::string #define RX_FN(a) a #define FN_TO_RX(a) a +#define RX_FNCMP(a, b) strcmp(a, b) #define RX_FNLEN(a) strlen(a) #define RX_DeleteFile DeleteFileA @@ -388,7 +390,7 @@ extern void SetCpuPriority(ThreadId id, CpuPriority priority); #define RX_FindFirstFileEx FindFirstFileExA #define RX_CreateDirectory CreateDirectoryA #define 
RX_FindNextFile FindNextFileA -#define RX_WIN32_FIND_DATA WIN32_FIND_DATA +#define RX_WIN32_FIND_DATA WIN32_FIND_DATAA #define RX_CreateDirectory CreateDirectoryA #define RX_RemoveDirectory RemoveDirectoryA #define RX_GetFileAttributesEx GetFileAttributesExA diff --git a/port/win/win_logger.cc b/port/win/win_logger.cc index a9b10c04b8c..a45f3c6d439 100644 --- a/port/win/win_logger.cc +++ b/port/win/win_logger.cc @@ -13,31 +13,33 @@ #if defined(OS_WIN) #include "port/win/win_logger.h" -#include "port/win/io_win.h" -#include +#include #include #include -#include -#include -#include "rocksdb/env.h" +#include +#include #include "monitoring/iostats_context_imp.h" #include "port/sys_time.h" +#include "port/win/env_win.h" +#include "port/win/io_win.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { namespace port { -WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, +WinLogger::WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, const InfoLogLevel log_level) : Logger(log_level), file_(file), gettid_(gettid), log_size_(0), last_flush_micros_(0), - env_(env), + clock_(clock), flush_pending_(false) { assert(file_ != NULL); assert(file_ != INVALID_HANDLE_VALUE); @@ -53,7 +55,7 @@ void WinLogger::DebugWriter(const char* str, int len) { } } -WinLogger::~WinLogger() { CloseInternal(); } +WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); } Status WinLogger::CloseImpl() { return CloseInternal(); @@ -88,7 +90,7 @@ void WinLogger::Flush() { // for perf reasons. } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } void WinLogger::Logv(const char* format, va_list ap) { diff --git a/port/win/win_logger.h b/port/win/win_logger.h index 116e7898db6..809c7d5a2bb 100644 --- a/port/win/win_logger.h +++ b/port/win/win_logger.h @@ -12,22 +12,21 @@ #pragma once +#include +#include + #include +#include #include "rocksdb/env.h" -#include -#include - namespace ROCKSDB_NAMESPACE { - -class Env; +class SystemClock; namespace port { - class WinLogger : public ROCKSDB_NAMESPACE::Logger { public: - WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, + WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL); virtual ~WinLogger(); @@ -54,7 +53,7 @@ class WinLogger : public ROCKSDB_NAMESPACE::Logger { uint64_t (*gettid_)(); // Return the thread id for the current thread std::atomic_size_t log_size_; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; + SystemClock* clock_; bool flush_pending_; Status CloseInternal(); diff --git a/src.mk b/src.mk index fe43adc46fa..70f657de4de 100644 --- a/src.mk +++ b/src.mk @@ -1,6 +1,7 @@ # These are the sources from which librocksdb.a is built: LIB_SOURCES = \ cache/cache.cc \ + cache/cache_entry_roles.cc \ cache/clock_cache.cc \ cache/lru_cache.cc \ cache/sharded_cache.cc \ @@ -21,8 +22,10 @@ LIB_SOURCES = \ cloud/cloud_storage_provider.cc \ cloud/cloud_file_cache.cc \ db/arena_wrapped_db_iter.cc \ + db/blob/blob_fetcher.cc \ db/blob/blob_file_addition.cc \ db/blob/blob_file_builder.cc \ + db/blob/blob_file_cache.cc \ db/blob/blob_file_garbage.cc \ db/blob/blob_file_meta.cc \ db/blob/blob_file_reader.cc \ @@ -32,7 +35,6 @@ LIB_SOURCES = \ db/builder.cc \ db/c.cc \ db/column_family.cc \ - db/compacted_db_impl.cc \ db/compaction/compaction.cc \ db/compaction/compaction_iterator.cc \ db/compaction/compaction_job.cc \ @@ -43,6 +45,7 @@ LIB_SOURCES = \ db/compaction/sst_partitioner.cc \ 
db/convenience.cc \ db/db_filesnapshot.cc \ + db/db_impl/compacted_db_impl.cc \ db/db_impl/db_impl.cc \ db/db_impl/db_impl_compaction_flush.cc \ db/db_impl/db_impl_debug.cc \ @@ -94,6 +97,7 @@ LIB_SOURCES = \ db/write_batch_base.cc \ db/write_controller.cc \ db/write_thread.cc \ + env/composite_env.cc \ env/env.cc \ env/env_chroot.cc \ env/env_encryption.cc \ @@ -101,6 +105,7 @@ LIB_SOURCES = \ env/env_posix.cc \ env/file_system.cc \ env/fs_posix.cc \ + env/fs_remap.cc \ env/file_system_tracer.cc \ env/io_posix.cc \ env/mock_env.cc \ @@ -108,6 +113,7 @@ LIB_SOURCES = \ file/file_prefetch_buffer.cc \ file/file_util.cc \ file/filename.cc \ + file/line_file_reader.cc \ file/random_access_file_reader.cc \ file/read_write_util.cc \ file/readahead_raf.cc \ @@ -143,6 +149,7 @@ LIB_SOURCES = \ monitoring/thread_status_util_debug.cc \ options/cf_options.cc \ options/configurable.cc \ + options/customizable.cc \ options/db_options.cc \ options/options.cc \ options/options_helper.cc \ @@ -217,11 +224,13 @@ LIB_SOURCES = \ util/compression_context_cache.cc \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ + util/crc32c_arm64.cc \ util/dynamic_bloom.cc \ util/hash.cc \ util/murmurhash.cc \ util/random.cc \ util/rate_limiter.cc \ + util/ribbon_config.cc \ util/slice.cc \ util/file_checksum_helper.cc \ util/status.cc \ @@ -267,8 +276,9 @@ LIB_SOURCES = \ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/trace/file_trace_reader_writer.cc \ - utilities/transactions/lock/lock_tracker.cc \ - utilities/transactions/lock/point_lock_tracker.cc \ + utilities/transactions/lock/lock_manager.cc \ + utilities/transactions/lock/point/point_lock_tracker.cc \ + utilities/transactions/lock/point/point_lock_manager.cc \ utilities/transactions/optimistic_transaction.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/pessimistic_transaction.cc \ @@ -276,7 +286,6 @@ LIB_SOURCES = \ utilities/transactions/snapshot_checker.cc \ utilities/transactions/transaction_base.cc \ utilities/transactions/transaction_db_mutex_impl.cc \ - utilities/transactions/transaction_lock_mgr.cc \ utilities/transactions/transaction_util.cc \ utilities/transactions/write_prepared_txn.cc \ utilities/transactions/write_prepared_txn_db.cc \ @@ -286,11 +295,6 @@ LIB_SOURCES = \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ -ifeq ($(ARMCRC_SOURCE),1) -LIB_SOURCES +=\ - util/crc32c_arm64.cc -endif - ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1)) LIB_SOURCES_ASM =\ util/crc32c_ppc_asm.S @@ -301,6 +305,22 @@ LIB_SOURCES_ASM = LIB_SOURCES_C = endif +RANGE_TREE_SOURCES =\ + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc \ + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \ + 
utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \ + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc \ + utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc \ + utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc + TOOL_LIB_SOURCES = \ tools/io_tracer_parser_tool.cc \ tools/ldb_cmd.cc \ @@ -317,6 +337,10 @@ MOCK_LIB_SOURCES = \ BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ + tools/simulated_hybrid_file_system.cc \ + +CACHE_BENCH_LIB_SOURCES = \ + cache/cache_bench_tool.cc \ STRESS_LIB_SOURCES = \ db_stress_tool/batched_ops_stress.cc \ @@ -378,15 +402,21 @@ TEST_MAIN_SOURCES = \ cloud/remote_compaction_test.cc \ db/blob/blob_file_addition_test.cc \ db/blob/blob_file_builder_test.cc \ + db/blob/blob_file_cache_test.cc \ db/blob/blob_file_garbage_test.cc \ db/blob/blob_file_reader_test.cc \ + db/blob/db_blob_basic_test.cc \ + db/blob/db_blob_compaction_test.cc \ + db/blob/db_blob_corruption_test.cc \ db/blob/db_blob_index_test.cc \ db/column_family_test.cc \ db/compact_files_test.cc \ + db/compaction/clipping_iterator_test.cc \ db/compaction/compaction_iterator_test.cc \ db/compaction/compaction_job_test.cc \ db/compaction/compaction_job_stats_test.cc \ db/compaction/compaction_picker_test.cc \ + db/compaction/compaction_service_test.cc \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ @@ -405,6 +435,7 @@ TEST_MAIN_SOURCES = \ db/db_iter_test.cc \ db/db_iter_stress_test.cc \ db/db_iterator_test.cc \ + db/db_kv_checksum_test.cc \ db/db_log_iter_test.cc \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ @@ -412,7 +443,7 @@ TEST_MAIN_SOURCES = \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ - db/db_impl/db_secondary_test.cc \ + db/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ @@ -423,6 +454,7 @@ TEST_MAIN_SOURCES = \ db/db_universal_compaction_test.cc \ db/db_wal_test.cc \ db/db_with_timestamp_compaction_test.cc \ + db/db_write_buffer_manager_test.cc \ db/db_write_test.cc \ db/dbformat_test.cc \ db/deletefile_test.cc \ @@ -431,7 +463,6 @@ TEST_MAIN_SOURCES = \ db/external_sst_file_test.cc \ db/fault_injection_test.cc \ db/file_indexer_test.cc \ - db/file_reader_writer_test.cc \ db/filename_test.cc \ db/flush_job_test.cc \ db/listener_test.cc \ @@ -477,6 +508,7 @@ TEST_MAIN_SOURCES = \ monitoring/statistics_test.cc \ monitoring/stats_history_test.cc \ options/configurable_test.cc \ + options/customizable_test.cc \ options/options_settable_test.cc \ options/options_test.cc \ table/block_based/block_based_filter_block_test.cc \ @@ -514,6 +546,7 @@ TEST_MAIN_SOURCES = \ util/random_test.cc \ util/rate_limiter_test.cc \ util/repeatable_thread_test.cc \ + util/ribbon_test.cc \ util/slice_test.cc \ util/slice_transform_test.cc \ util/timer_queue_test.cc \ @@ -540,8 +573,9 @@ TEST_MAIN_SOURCES = \ utilities/simulator_cache/sim_cache_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ + utilities/transactions/lock/range/range_locking_test.cc \ utilities/transactions/transaction_test.cc \ - utilities/transactions/transaction_lock_mgr_test.cc \ + utilities/transactions/lock/point/point_lock_manager_test.cc \ utilities/transactions/write_prepared_transaction_test.cc \ utilities/transactions/write_unprepared_transaction_test.cc \ utilities/ttl/ttl_test.cc \ @@ -557,6 +591,7 @@ JNI_NATIVE_SOURCES = \ 
java/rocksjni/backupablejni.cc \ java/rocksjni/checkpoint.cc \ java/rocksjni/clock_cache.cc \ + java/rocksjni/cache.cc \ java/rocksjni/columnfamilyhandle.cc \ java/rocksjni/compact_range_options.cc \ java/rocksjni/compaction_filter.cc \ @@ -574,6 +609,8 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/config_options.cc \ java/rocksjni/env.cc \ java/rocksjni/env_options.cc \ + java/rocksjni/event_listener.cc \ + java/rocksjni/event_listener_jnicallback.cc \ java/rocksjni/ingest_external_file_options.cc \ java/rocksjni/filter.cc \ java/rocksjni/iterator.cc \ @@ -620,6 +657,7 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/transaction_notifier.cc \ java/rocksjni/transaction_notifier_jnicallback.cc \ java/rocksjni/ttl.cc \ + java/rocksjni/testable_event_listener.cc \ java/rocksjni/wal_filter.cc \ java/rocksjni/wal_filter_jnicallback.cc \ java/rocksjni/write_batch.cc \ diff --git a/table/adaptive/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc index 480c4c9a600..63333b1b333 100644 --- a/table/adaptive/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -71,10 +71,9 @@ Status AdaptiveTableFactory::NewTableReader( } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - return table_factory_to_write_->NewTableBuilder(table_builder_options, - column_family_id, file); + return table_factory_to_write_->NewTableBuilder(table_builder_options, file); } std::string AdaptiveTableFactory::GetPrintableOptions() const { diff --git a/table/adaptive/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h index cbc81868cc6..65f816fad83 100644 --- a/table/adaptive/adaptive_table_factory.h +++ b/table/adaptive/adaptive_table_factory.h @@ -42,7 +42,7 @@ class AdaptiveTableFactory : public TableFactory { TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; std::string GetPrintableOptions() const override; diff --git a/table/block_based/block.cc b/table/block_based/block.cc index bc481d57e53..2d32ebcb492 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -15,7 +15,6 @@ #include #include -#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc index 2e457e32f6e..13b3dcc448a 100644 --- a/table/block_based/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -68,7 +68,7 @@ BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( whole_key_filtering_(table_opt.whole_key_filtering), prev_prefix_start_(0), prev_prefix_size_(0), - num_added_(0) { + total_added_in_built_(0) { assert(policy_); } @@ -80,19 +80,22 @@ void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { } } -void BlockBasedFilterBlockBuilder::Add(const Slice& key) { - if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { - AddPrefix(key); +size_t BlockBasedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + start_.size(); +} + +void BlockBasedFilterBlockBuilder::Add(const Slice& key_without_ts) { + if (prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { + AddPrefix(key_without_ts); } if (whole_key_filtering_) { - AddKey(key); 
+ AddKey(key_without_ts); } } // Add key to filter if needed inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { - num_added_++; start_.push_back(entries_.size()); entries_.append(key.data(), key.size()); } @@ -118,6 +121,7 @@ Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, Status* status) { // In this impl we ignore BlockHandle *status = Status::OK(); + if (!start_.empty()) { GenerateFilter(); } @@ -140,6 +144,7 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { filter_offsets_.push_back(static_cast(result_.size())); return; } + total_added_in_built_ += num_entries; // Make list of keys from flattened key structure start_.push_back(entries_.size()); // Simplify length computation diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h index 67ded1ee3b4..0b46cd7c1bf 100644 --- a/table/block_based/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -44,8 +44,11 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { + return start_.empty() && filter_offsets_.empty(); + } + virtual size_t EstimateEntriesAdded() override; virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; @@ -70,7 +73,7 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { std::string result_; // Filter data computed so far std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; - size_t num_added_; // Number of keys added + uint64_t total_added_in_built_; // Total keys added to filters built so far }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc index 283d6a9a2e4..862e90233e6 100644 --- a/table/block_based/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -76,17 +76,26 @@ TEST_F(FilterBlockTest, EmptyBuilder) { TEST_F(FilterBlockTest, SingleChunk) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - ASSERT_EQ(0, builder.NumAdded()); + ASSERT_TRUE(builder.IsEmpty()); builder.StartBlock(100); builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); + builder.Add("bar"); builder.Add("bar"); builder.Add("box"); builder.StartBlock(200); builder.Add("box"); builder.StartBlock(300); builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice slice(builder.Finish()); + // XXX: "bar" should only count once but is counted twice. This actually + // indicates a serious space usage bug in old block-based filter. Good + // that it is deprecated. + // "box" counts twice, because it's in distinct blocks. 
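// Per the new implementation above, EstimateEntriesAdded() returns
// total_added_in_built_ plus the pending start_.size(), and Add() does not
// de-duplicate keys, so the repeated "bar" within one block and "box"
// across two blocks each count twice, giving the 6 asserted below.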
+ ASSERT_EQ(6, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); CachableEntry block( new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 28bfbb7b23b..ac0d45cddb9 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -11,25 +11,26 @@ #include #include + #include #include #include #include +#include #include #include #include #include "db/dbformat.h" #include "index_builder.h" -#include "port/lang.h" - +#include "memory/memory_allocator.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" - #include "table/block_based/block.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" @@ -41,8 +42,6 @@ #include "table/block_based/partitioned_filter_block.h" #include "table/format.h" #include "table/table_builder.h" - -#include "memory/memory_allocator.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" @@ -56,7 +55,6 @@ namespace ROCKSDB_NAMESPACE { extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -typedef BlockBasedTableOptions::IndexType IndexType; // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace { @@ -68,7 +66,7 @@ FilterBlockBuilder* CreateFilterBlockBuilder( const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { const BlockBasedTableOptions& table_opt = context.table_options; - if (table_opt.filter_policy == nullptr) return nullptr; + assert(table_opt.filter_policy); // precondition FilterBitsBuilder* filter_bits_builder = BloomFilterPolicy::GetBuilderFromContext(context); @@ -79,8 +77,9 @@ FilterBlockBuilder* CreateFilterBlockBuilder( if (table_opt.partition_filters) { assert(p_index_builder != nullptr); // Since after partition cut request from filter builder it takes time - // until index builder actully cuts the partition, we take the lower bound - // as partition size. + // until the index builder actually cuts the partition, at the end of a + // data block potentially with many keys, we take the lower bound as + // partition size. assert(table_opt.block_size_deviation <= 100); auto partition_size = static_cast(((table_opt.metadata_block_size * @@ -212,9 +211,9 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector return Status::OK(); } - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks.
return; @@ -248,24 +247,18 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector }; struct BlockBasedTableBuilder::Rep { - const ImmutableCFOptions ioptions; + const ImmutableOptions ioptions; const MutableCFOptions moptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFileWriter* file; std::atomic offset; - // Synchronize status & io_status accesses across threads from main thread, - // compression thread and write thread in parallel compression. - std::mutex status_mutex; size_t alignment; BlockBuilder data_block; - // Buffers uncompressed data blocks and keys to replay later. Needed when + // Buffers uncompressed data blocks to replay later. Needed when // compression dictionary is enabled so we can finalize the dictionary before // compressing any data blocks. - // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data - // blocks as it's redundant, but it's easier to implement for now. - std::vector>> - data_block_and_keys_buffers; + std::vector data_block_buffers; BlockBuilder range_del_block; InternalKeySliceTransform internal_prefix_transform; @@ -276,6 +269,11 @@ struct BlockBasedTableBuilder::Rep { const Slice* first_key_in_next_block = nullptr; CompressionType compression_type; uint64_t sample_for_compression; + std::atomic compressible_input_data_bytes; + std::atomic uncompressible_input_data_bytes; + std::atomic sampled_input_data_bytes; + std::atomic sampled_output_slow_data_bytes; + std::atomic sampled_output_fast_data_bytes; CompressionOptions compression_opts; std::unique_ptr compression_dict; std::vector> compression_ctxs; @@ -310,9 +308,14 @@ struct BlockBasedTableBuilder::Rep { kClosed, }; State state; + // `kBuffered` state is allowed only as long as the buffering of uncompressed + // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`. + uint64_t buffer_limit; const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; + char cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; + size_t cache_key_prefix_size; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -320,17 +323,16 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; - int level_at_creation; uint32_t column_family_id; - const std::string& column_family_name; + std::string column_family_name; uint64_t creation_time = 0; uint64_t oldest_key_time = 0; - const uint64_t target_file_size; uint64_t file_creation_time = 0; // DB IDs const std::string db_id; const std::string db_session_id; + std::string db_host_id; std::vector> table_properties_collectors; @@ -339,78 +341,72 @@ struct BlockBasedTableBuilder::Rep { uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } - const IOStatus& GetIOStatus() { - if (compression_opts.parallel_threads > 1) { - std::lock_guard lock(status_mutex); - return io_status; + bool IsParallelCompressionEnabled() const { + return compression_opts.parallel_threads > 1; + } + + Status GetStatus() { + // We need to make modifications of status visible when status_ok is set + // to false, and this is ensured by status_mutex, so no special memory + // order for status_ok is required. 
+ if (status_ok.load(std::memory_order_relaxed)) { + return Status::OK(); } else { - return io_status; + return CopyStatus(); } } - const Status& GetStatus() { - if (compression_opts.parallel_threads > 1) { - std::lock_guard lock(status_mutex); - return status; + Status CopyStatus() { + std::lock_guard lock(status_mutex); + return status; + } + + IOStatus GetIOStatus() { + // We need to make modifications of io_status visible when status_ok is set + // to false, and this is ensured by io_status_mutex, so no special memory + // order for io_status_ok is required. + if (io_status_ok.load(std::memory_order_relaxed)) { + return IOStatus::OK(); } else { - return status; + return CopyIOStatus(); } } - void SyncStatusFromIOStatus() { - if (compression_opts.parallel_threads > 1) { - std::lock_guard lock(status_mutex); - if (status.ok()) { - status = io_status; - } - } else if (status.ok()) { - status = io_status; - } + IOStatus CopyIOStatus() { + std::lock_guard lock(io_status_mutex); + return io_status; } // Never erase an existing status that is not OK. void SetStatus(Status s) { - if (!s.ok()) { + if (!s.ok() && status_ok.load(std::memory_order_relaxed)) { // Locking is an overkill for non compression_opts.parallel_threads // case but since it's unlikely that s is not OK, we take this cost // to be simplicity. std::lock_guard lock(status_mutex); - if (status.ok()) { - status = s; - } + status = s; + status_ok.store(false, std::memory_order_relaxed); } } // Never erase an existing I/O status that is not OK. void SetIOStatus(IOStatus ios) { - if (!ios.ok()) { + if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) { // Locking is an overkill for non compression_opts.parallel_threads // case but since it's unlikely that s is not OK, we take this cost // to be simplicity. - std::lock_guard lock(status_mutex); - if (io_status.ok()) { - io_status = ios; - } + std::lock_guard lock(io_status_mutex); + io_status = ios; + io_status_ok.store(false, std::memory_order_relaxed); } } - Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, - const BlockBasedTableOptions& table_opt, - const InternalKeyComparator& icomparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t _column_family_id, WritableFileWriter* f, - const CompressionType _compression_type, - const uint64_t _sample_for_compression, - const CompressionOptions& _compression_opts, const bool skip_filters, - const int _level_at_creation, const std::string& _column_family_name, - const uint64_t _creation_time, const uint64_t _oldest_key_time, - const uint64_t _target_file_size, const uint64_t _file_creation_time, - const std::string& _db_id, const std::string& _db_session_id) - : ioptions(_ioptions), - moptions(_moptions), + Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo, + WritableFileWriter* f) + : ioptions(tbo.ioptions), + moptions(tbo.moptions), table_options(table_opt), - internal_comparator(icomparator), + internal_comparator(tbo.internal_comparator), file(f), offset(0), alignment(table_options.block_align @@ -419,37 +415,52 @@ struct BlockBasedTableBuilder::Rep { data_block(table_options.block_restart_interval, table_options.use_delta_encoding, false /* use_value_delta_encoding */, - icomparator.user_comparator() + tbo.internal_comparator.user_comparator() ->CanKeysWithDifferentByteContentsBeEqual() ? 
BlockBasedTableOptions::kDataBlockBinarySearch : table_options.data_block_index_type, table_options.data_block_hash_table_util_ratio), range_del_block(1 /* block_restart_interval */), - internal_prefix_transform(_moptions.prefix_extractor.get()), - compression_type(_compression_type), - sample_for_compression(_sample_for_compression), - compression_opts(_compression_opts), + internal_prefix_transform(tbo.moptions.prefix_extractor.get()), + compression_type(tbo.compression_type), + sample_for_compression(tbo.moptions.sample_for_compression), + compressible_input_data_bytes(0), + uncompressible_input_data_bytes(0), + sampled_input_data_bytes(0), + sampled_output_slow_data_bytes(0), + sampled_output_fast_data_bytes(0), + compression_opts(tbo.compression_opts), compression_dict(), - compression_ctxs(_compression_opts.parallel_threads), - verify_ctxs(_compression_opts.parallel_threads), + compression_ctxs(tbo.compression_opts.parallel_threads), + verify_ctxs(tbo.compression_opts.parallel_threads), verify_dict(), - state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered - : State::kUnbuffered), + state((tbo.compression_opts.max_dict_bytes > 0) ? State::kBuffered + : State::kUnbuffered), use_delta_encoding_for_index_values(table_opt.format_version >= 4 && !table_opt.block_align), + cache_key_prefix_size(0), compressed_cache_key_prefix_size(0), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), - level_at_creation(_level_at_creation), - column_family_id(_column_family_id), - column_family_name(_column_family_name), - creation_time(_creation_time), - oldest_key_time(_oldest_key_time), - target_file_size(_target_file_size), - file_creation_time(_file_creation_time), - db_id(_db_id), - db_session_id(_db_session_id) { + column_family_id(tbo.column_family_id), + column_family_name(tbo.column_family_name), + creation_time(tbo.creation_time), + oldest_key_time(tbo.oldest_key_time), + file_creation_time(tbo.file_creation_time), + db_id(tbo.db_id), + db_session_id(tbo.db_session_id), + db_host_id(ioptions.db_host_id), + status_ok(true), + io_status_ok(true) { + if (tbo.target_file_size == 0) { + buffer_limit = compression_opts.max_dict_buffer_bytes; + } else if (compression_opts.max_dict_buffer_bytes == 0) { + buffer_limit = tbo.target_file_size; + } else { + buffer_limit = std::min(tbo.target_file_size, + compression_opts.max_dict_buffer_bytes); + } for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { compression_ctxs[i].reset(new CompressionContext(compression_type)); } @@ -465,39 +476,71 @@ struct BlockBasedTableBuilder::Rep { &this->internal_prefix_transform, use_delta_encoding_for_index_values, table_options)); } - if (skip_filters) { - filter_builder = nullptr; + if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) { + // Apply optimize_filters_for_hits setting here when applicable by + // skipping filter generation + filter_builder.reset(); + } else if (tbo.skip_filters) { + // For SstFileWriter skip_filters + filter_builder.reset(); + } else if (!table_options.filter_policy) { + // Null filter_policy -> no filter + filter_builder.reset(); } else { - FilterBuildingContext context(table_options); - context.column_family_name = column_family_name; - context.compaction_style = ioptions.compaction_style; - context.level_at_creation = level_at_creation; - context.info_log = ioptions.info_log; + FilterBuildingContext filter_context(table_options); + + filter_context.info_log = ioptions.logger; + 
filter_context.column_family_name = tbo.column_family_name; + filter_context.reason = tbo.reason; + + // Only populate other fields if known to be in LSM rather than + // generating external SST file + if (tbo.reason != TableFileCreationReason::kMisc) { + filter_context.compaction_style = ioptions.compaction_style; + filter_context.num_levels = ioptions.num_levels; + filter_context.level_at_creation = tbo.level_at_creation; + filter_context.is_bottommost = tbo.is_bottommost; + assert(filter_context.level_at_creation < filter_context.num_levels); + } + filter_builder.reset(CreateFilterBlockBuilder( - ioptions, moptions, context, use_delta_encoding_for_index_values, - p_index_builder_)); + ioptions, moptions, filter_context, + use_delta_encoding_for_index_values, p_index_builder_)); } - for (auto& collector_factories : *int_tbl_prop_collector_factories) { + const auto& factory_range = tbo.int_tbl_prop_collector_factories; + for (auto it = factory_range.first; it != factory_range.second; ++it) { + assert(*it); + table_properties_collectors.emplace_back( - collector_factories->CreateIntTblPropCollector(column_family_id)); + (*it)->CreateIntTblPropCollector(column_family_id)); } table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( table_options.index_type, table_options.whole_key_filtering, - _moptions.prefix_extractor != nullptr)); + moptions.prefix_extractor != nullptr)); if (table_options.verify_compression) { for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { verify_ctxs[i].reset(new UncompressionContext(compression_type)); } } + + if (!ReifyDbHostIdProperty(ioptions.env, &db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); + } } Rep(const Rep&) = delete; Rep& operator=(const Rep&) = delete; private: + // Synchronize status & io_status accesses across threads from main thread, + // compression thread and write thread in parallel compression. + std::mutex status_mutex; + std::atomic status_ok; Status status; + std::mutex io_status_mutex; + std::atomic io_status_ok; IOStatus io_status; }; @@ -593,41 +636,123 @@ struct BlockBasedTableBuilder::ParallelCompressionRep { WriteQueue write_queue; std::unique_ptr write_thread; - // Raw bytes compressed so far. - uint64_t raw_bytes_compressed; - // Size of current block being appended. - uint64_t raw_bytes_curr_block; - // Raw bytes under compression and not appended yet. - std::atomic raw_bytes_inflight; - // Number of blocks under compression and not appended yet. - std::atomic blocks_inflight; - // Current compression ratio, maintained by BGWorkWriteRawBlock. - std::atomic curr_compression_ratio; - // Estimated SST file size. - std::atomic estimated_file_size; - - // Wait for the completion of first block compression to get a - // non-zero compression ratio. - bool first_block; + // Estimate output file size when parallel compression is enabled. This is + // necessary because compression & flush are no longer synchronized, + // and BlockBasedTableBuilder::FileSize() is no longer accurate. + // memory_order_relaxed suffices because accurate statistics is not required. 
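A minimal sketch of the estimate that the FileSizeEstimator below maintains, using made-up numbers; the function name is invented for illustration, and block_trailer_size stands in for kBlockTrailerSize (the per-block compression type byte plus checksum):

#include <cstdint>

// Mirrors what EmitBlock()/ReapBlock() compute: bytes already flushed, plus
// in-flight raw bytes scaled by the running compression ratio, plus one
// trailer per in-flight block.
uint64_t EstimateSstSize(uint64_t curr_file_size, uint64_t raw_bytes_inflight,
                         uint64_t blocks_inflight, double curr_compression_ratio,
                         uint64_t block_trailer_size) {
  return curr_file_size +
         static_cast<uint64_t>(raw_bytes_inflight * curr_compression_ratio) +
         blocks_inflight * block_trailer_size;
}

// For example, with 10 MiB already written, four 256 KiB raw blocks in flight,
// and a running ratio of 0.5, EstimateSstSize(10 << 20, 4 * (256 << 10), 4, 0.5, 5)
// is roughly 10.5 MiB.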
+ class FileSizeEstimator { + public: + explicit FileSizeEstimator() + : raw_bytes_compressed(0), + raw_bytes_curr_block(0), + raw_bytes_curr_block_set(false), + raw_bytes_inflight(0), + blocks_inflight(0), + curr_compression_ratio(0), + estimated_file_size(0) {} + + // Estimate file size when a block is about to be emitted to + // compression thread + void EmitBlock(uint64_t raw_block_size, uint64_t curr_file_size) { + uint64_t new_raw_bytes_inflight = + raw_bytes_inflight.fetch_add(raw_block_size, + std::memory_order_relaxed) + + raw_block_size; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1; + + estimated_file_size.store( + curr_file_size + + static_cast( + static_cast(new_raw_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + } + + // Estimate file size when a block is already reaped from + // compression thread + void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) { + assert(raw_bytes_curr_block_set); + + uint64_t new_raw_bytes_compressed = + raw_bytes_compressed + raw_bytes_curr_block; + assert(new_raw_bytes_compressed > 0); + + curr_compression_ratio.store( + (curr_compression_ratio.load(std::memory_order_relaxed) * + raw_bytes_compressed + + compressed_block_size) / + static_cast(new_raw_bytes_compressed), + std::memory_order_relaxed); + raw_bytes_compressed = new_raw_bytes_compressed; + + uint64_t new_raw_bytes_inflight = + raw_bytes_inflight.fetch_sub(raw_bytes_curr_block, + std::memory_order_relaxed) - + raw_bytes_curr_block; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1; + + estimated_file_size.store( + curr_file_size + + static_cast( + static_cast(new_raw_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + + raw_bytes_curr_block_set = false; + } + + void SetEstimatedFileSize(uint64_t size) { + estimated_file_size.store(size, std::memory_order_relaxed); + } + + uint64_t GetEstimatedFileSize() { + return estimated_file_size.load(std::memory_order_relaxed); + } + + void SetCurrBlockRawSize(uint64_t size) { + raw_bytes_curr_block = size; + raw_bytes_curr_block_set = true; + } + + private: + // Raw bytes compressed so far. + uint64_t raw_bytes_compressed; + // Size of current block being appended. + uint64_t raw_bytes_curr_block; + // Whether raw_bytes_curr_block has been set for next + // ReapBlock call. + bool raw_bytes_curr_block_set; + // Raw bytes under compression and not appended yet. + std::atomic raw_bytes_inflight; + // Number of blocks under compression and not appended yet. + std::atomic blocks_inflight; + // Current compression ratio, maintained by BGWorkWriteRawBlock. + std::atomic curr_compression_ratio; + // Estimated SST file size. + std::atomic estimated_file_size; + }; + FileSizeEstimator file_size_estimator; + + // Facilities used for waiting first block completion. Need to Wait for + // the completion of first block compression and flush to get a non-zero + // compression ratio. 
+ std::atomic first_block_processed; std::condition_variable first_block_cond; std::mutex first_block_mutex; - bool finished; - - ParallelCompressionRep(uint32_t parallel_threads) + explicit ParallelCompressionRep(uint32_t parallel_threads) : curr_block_keys(new Keys()), block_rep_buf(parallel_threads), block_rep_pool(parallel_threads), compress_queue(parallel_threads), write_queue(parallel_threads), - raw_bytes_compressed(0), - raw_bytes_curr_block(0), - raw_bytes_inflight(0), - blocks_inflight(0), - curr_compression_ratio(0), - estimated_file_size(0), - first_block(true), - finished(false) { + first_block_processed(false) { for (uint32_t i = 0; i < parallel_threads; i++) { block_rep_buf[i].contents = Slice(); block_rep_buf[i].compressed_contents = Slice(); @@ -643,27 +768,98 @@ struct BlockBasedTableBuilder::ParallelCompressionRep { } ~ParallelCompressionRep() { block_rep_pool.finish(); } + + // Make a block prepared to be emitted to compression thread + // Used in non-buffered mode + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + BlockBuilder* data_block) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + data_block->SwapAndReset(*(block_rep->data)); + block_rep->contents = *(block_rep->data); + std::swap(block_rep->keys, curr_block_keys); + curr_block_keys->Clear(); + return block_rep; + } + + // Used in EnterUnbuffered + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + std::string* data_block, + std::vector* keys) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + std::swap(*(block_rep->data), *data_block); + block_rep->contents = *(block_rep->data); + block_rep->keys->SwapAssign(*keys); + return block_rep; + } + + // Emit a block to compression thread + void EmitBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + assert(block_rep->status.ok()); + if (!write_queue.push(block_rep->slot.get())) { + return; + } + if (!compress_queue.push(block_rep)) { + return; + } + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::unique_lock lock(first_block_mutex); + first_block_cond.wait(lock, [this] { + return first_block_processed.load(std::memory_order_relaxed); + }); + } + } + + // Reap a block from compression thread + void ReapBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + block_rep->compressed_data->clear(); + block_rep_pool.push(block_rep); + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::lock_guard lock(first_block_mutex); + first_block_processed.store(true, std::memory_order_relaxed); + first_block_cond.notify_one(); + } + } + + private: + BlockRep* PrepareBlockInternal(CompressionType compression_type, + const Slice* first_key_in_next_block) { + BlockRep* block_rep = nullptr; + block_rep_pool.pop(block_rep); + assert(block_rep != nullptr); + + assert(block_rep->data); + + block_rep->compression_type = compression_type; + + if (first_key_in_next_block == nullptr) { + block_rep->first_key_in_next_block.reset(nullptr); + } else { + block_rep->first_key_in_next_block->assign( + first_key_in_next_block->data(), first_key_in_next_block->size()); + } + + return block_rep; + } }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& 
internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const int level_at_creation, - const uint64_t creation_time, const uint64_t oldest_key_time, - const uint64_t target_file_size, const uint64_t file_creation_time, - const std::string& db_id, const std::string& db_session_id) { + const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo, + WritableFileWriter* file) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { ROCKS_LOG_WARN( - ioptions.info_log, + tbo.ioptions.logger, "Silently converting format_version to 1 because checksum is " "non-default"); // silently convert format_version to 1 to keep consistent with current @@ -671,36 +867,16 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( sanitized_table_options.format_version = 1; } - rep_ = new Rep( - ioptions, moptions, sanitized_table_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, file, - compression_type, sample_for_compression, compression_opts, skip_filters, - level_at_creation, column_family_name, creation_time, oldest_key_time, - target_file_size, file_creation_time, db_id, db_session_id); + rep_ = new Rep(sanitized_table_options, tbo, file); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); } - if (table_options.block_cache_compressed.get() != nullptr) { - BlockBasedTable::GenerateCachePrefix( - table_options.block_cache_compressed.get(), file->writable_file(), - &rep_->compressed_cache_key_prefix[0], - &rep_->compressed_cache_key_prefix_size); - } - - if (rep_->compression_opts.parallel_threads > 1) { - rep_->pc_rep.reset( - new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); - rep_->pc_rep->compress_thread_pool.reserve( - rep_->compression_opts.parallel_threads); - for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { - rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { - BGWorkCompression(*(rep_->compression_ctxs[i]), - rep_->verify_ctxs[i].get()); - }); - } - rep_->pc_rep->write_thread.reset( - new port::Thread([this] { BGWorkWriteRawBlock(); })); + + SetupCacheKeyPrefix(tbo); + + if (rep_->IsParallelCompressionEnabled()) { + StartParallelCompression(); } } @@ -728,8 +904,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->first_key_in_next_block = &key; Flush(); - if (r->state == Rep::State::kBuffered && r->target_file_size != 0 && - r->data_begin_offset > r->target_file_size) { + if (r->state == Rep::State::kBuffered && r->buffer_limit != 0 && + r->data_begin_offset > r->buffer_limit) { EnterUnbuffered(); } @@ -742,7 +918,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // entries in the first block and < all entries in subsequent // blocks. 
if (ok() && r->state == Rep::State::kUnbuffered) { - if (r->compression_opts.parallel_threads > 1) { + if (r->IsParallelCompressionEnabled()) { r->pc_rep->curr_block_keys->Clear(); } else { r->index_builder->AddIndexEntry(&r->last_key, &key, @@ -754,7 +930,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // Note: PartitionedFilterBlockBuilder requires key being added to filter // builder after being added to index builder. if (r->state == Rep::State::kUnbuffered) { - if (r->compression_opts.parallel_threads > 1) { + if (r->IsParallelCompressionEnabled()) { r->pc_rep->curr_block_keys->PushBack(key); } else { if (r->filter_builder != nullptr) { @@ -768,28 +944,24 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->last_key.assign(key.data(), key.size()); r->data_block.Add(key, value); if (r->state == Rep::State::kBuffered) { - // Buffer keys to be replayed during `Finish()` once compression - // dictionary has been finalized. - if (r->data_block_and_keys_buffers.empty() || should_flush) { - r->data_block_and_keys_buffers.emplace_back(); - } - r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + // Buffered keys will be replayed from data_block_buffers during + // `Finish()` once compression dictionary has been finalized. } else { - if (r->compression_opts.parallel_threads == 1) { + if (!r->IsParallelCompressionEnabled()) { r->index_builder->OnKeyAdded(key); } } // TODO offset passed in is not accurate for parallel compression case NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, - r->ioptions.info_log); + r->ioptions.logger); } else if (value_type == kTypeRangeDeletion) { r->range_del_block.Add(key, value); // TODO offset passed in is not accurate for parallel compression case NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, - r->ioptions.info_log); + r->ioptions.logger); } else { assert(false); } @@ -812,61 +984,15 @@ void BlockBasedTableBuilder::Flush() { assert(rep_->state != Rep::State::kClosed); if (!ok()) return; if (r->data_block.empty()) return; - if (r->compression_opts.parallel_threads > 1 && + if (r->IsParallelCompressionEnabled() && r->state == Rep::State::kUnbuffered) { - ParallelCompressionRep::BlockRep* block_rep = nullptr; - r->pc_rep->block_rep_pool.pop(block_rep); - assert(block_rep != nullptr); - r->data_block.Finish(); - assert(block_rep->data); - r->data_block.SwapAndReset(*(block_rep->data)); - - block_rep->contents = *(block_rep->data); - - block_rep->compression_type = r->compression_type; - - std::swap(block_rep->keys, r->pc_rep->curr_block_keys); - r->pc_rep->curr_block_keys->Clear(); - - if (r->first_key_in_next_block == nullptr) { - block_rep->first_key_in_next_block.reset(nullptr); - } else { - block_rep->first_key_in_next_block->assign( - r->first_key_in_next_block->data(), - r->first_key_in_next_block->size()); - } - - uint64_t new_raw_bytes_inflight = - r->pc_rep->raw_bytes_inflight.fetch_add(block_rep->data->size(), - std::memory_order_relaxed) + - block_rep->data->size(); - uint64_t new_blocks_inflight = - r->pc_rep->blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1; - r->pc_rep->estimated_file_size.store( - r->get_offset() + - static_cast(static_cast(new_raw_bytes_inflight) * - r->pc_rep->curr_compression_ratio.load( - std::memory_order_relaxed)) + - new_blocks_inflight * kBlockTrailerSize, - std::memory_order_relaxed); - - // Read out first_block here to avoid data race with 
BGWorkWriteRawBlock - bool first_block = r->pc_rep->first_block; - - assert(block_rep->status.ok()); - if (!r->pc_rep->write_queue.push(block_rep->slot.get())) { - return; - } - if (!r->pc_rep->compress_queue.push(block_rep)) { - return; - } - - if (first_block) { - std::unique_lock lock(r->pc_rep->first_block_mutex); - r->pc_rep->first_block_cond.wait(lock, - [r] { return !r->pc_rep->first_block; }); - } + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, r->first_key_in_next_block, &(r->data_block)); + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); } else { WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); } @@ -875,23 +1001,25 @@ void BlockBasedTableBuilder::Flush() { void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block) { - WriteBlock(block->Finish(), handle, is_data_block); - block->Reset(); + block->Finish(); + std::string raw_block_contents; + block->SwapAndReset(raw_block_contents); + if (rep_->state == Rep::State::kBuffered) { + assert(is_data_block); + rep_->data_block_buffers.emplace_back(std::move(raw_block_contents)); + rep_->data_begin_offset += rep_->data_block_buffers.back().size(); + return; + } + WriteBlock(raw_block_contents, handle, is_data_block); } void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, BlockHandle* handle, bool is_data_block) { Rep* r = rep_; + assert(r->state == Rep::State::kUnbuffered); Slice block_contents; CompressionType type; - if (r->state == Rep::State::kBuffered) { - assert(is_data_block); - assert(!r->data_block_and_keys_buffers.empty()); - r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); - r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); - return; - } Status compress_status; CompressAndVerifyBlock(raw_block_contents, is_data_block, *(r->compression_ctxs[0]), r->verify_ctxs[0].get(), @@ -901,7 +1029,9 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, if (!ok()) { return; } - WriteRawBlock(block_contents, type, handle, is_data_block); + + WriteRawBlock(block_contents, type, handle, is_data_block, + &raw_block_contents); r->compressed_output.clear(); if (is_data_block) { if (r->filter_builder != nullptr) { @@ -913,9 +1043,11 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, } void BlockBasedTableBuilder::BGWorkCompression( - CompressionContext& compression_ctx, UncompressionContext* verify_ctx) { - ParallelCompressionRep::BlockRep* block_rep; + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx) { + ParallelCompressionRep::BlockRep* block_rep = nullptr; while (rep_->pc_rep->compress_queue.pop(block_rep)) { + assert(block_rep != nullptr); CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/ compression_ctx, verify_ctx, block_rep->compressed_data.get(), @@ -927,25 +1059,32 @@ void BlockBasedTableBuilder::BGWorkCompression( void BlockBasedTableBuilder::CompressAndVerifyBlock( const Slice& raw_block_contents, bool is_data_block, - CompressionContext& compression_ctx, UncompressionContext* verify_ctx_ptr, + const CompressionContext& compression_ctx, UncompressionContext* verify_ctx, std::string* compressed_output, Slice* block_contents, CompressionType* type, Status* out_status) { // File format contains a sequence of blocks where each block has: // block_data: 
uint8[n] // type: uint8 // crc: uint32 - assert(ok()); Rep* r = rep_; + bool is_status_ok = ok(); + if (!r->IsParallelCompressionEnabled()) { + assert(is_status_ok); + } *type = r->compression_type; uint64_t sample_for_compression = r->sample_for_compression; bool abort_compression = false; StopWatchNano timer( - r->ioptions.env, - ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); + r->ioptions.clock, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); - if (raw_block_contents.size() < kCompressionSizeLimit) { + if (is_status_ok && raw_block_contents.size() < kCompressionSizeLimit) { + if (is_data_block) { + r->compressible_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + } const CompressionDict* compression_dict; if (!is_data_block || r->compression_dict == nullptr) { compression_dict = &CompressionDict::GetEmptyDict(); @@ -964,6 +1103,16 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( r->table_options.format_version, is_data_block /* do_sample */, compressed_output, &sampled_output_fast, &sampled_output_slow); + if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) { + // Currently compression sampling is only enabled for data block. + assert(is_data_block); + r->sampled_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(), + std::memory_order_relaxed); + r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(), + std::memory_order_relaxed); + } // notify collectors on block add NotifyCollectTableCollectorsOnBlockAdd( r->table_properties_collectors, raw_block_contents.size(), @@ -982,7 +1131,7 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( } assert(verify_dict != nullptr); BlockContents contents; - UncompressionInfo uncompression_info(*verify_ctx_ptr, *verify_dict, + UncompressionInfo uncompression_info(*verify_ctx, *verify_dict, r->compression_type); Status stat = UncompressBlockContentsForCompressionType( uncompression_info, block_contents->data(), block_contents->size(), @@ -993,7 +1142,7 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( if (!compressed_ok) { // The result of the compression was invalid. abort. abort_compression = true; - ROCKS_LOG_ERROR(r->ioptions.info_log, + ROCKS_LOG_ERROR(r->ioptions.logger, "Decompressed block did not match raw block"); *out_status = Status::Corruption("Decompressed block did not match raw block"); @@ -1007,36 +1156,45 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( } } else { // Block is too big to be compressed. + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + } abort_compression = true; } + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize, + std::memory_order_relaxed); + } // Abort compression if the block is too big, or did not pass // verification. 
if (abort_compression) { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); *type = kNoCompression; *block_contents = raw_block_contents; } else if (*type != kNoCompression) { - if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) { - RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) { + RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED, + RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED, raw_block_contents.size()); - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); + RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED); } else if (*type != r->compression_type) { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); } } void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle, - bool is_data_block) { + bool is_data_block, + const Slice* raw_block_contents) { Rep* r = rep_; Status s = Status::OK(); IOStatus io_s = IOStatus::OK(); - StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); + StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->get_offset()); handle->set_size(block_contents.size()); assert(status().ok()); @@ -1089,7 +1247,21 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, io_s = r->file->Append(Slice(trailer, kBlockTrailerSize)); if (io_s.ok()) { assert(s.ok()); - s = InsertBlockInCache(block_contents, type, handle); + if (is_data_block && + r->table_options.prepopulate_block_cache == + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly) { + if (type == kNoCompression) { + s = InsertBlockInCache(block_contents, handle); + } else if (raw_block_contents != nullptr) { + s = InsertBlockInCache(*raw_block_contents, handle); + } + if (!s.ok()) { + r->SetStatus(s); + } + } + // TODO:: Should InsertBlockInCompressedCache take into account error from + // InsertBlockInCache or ignore and overwrite it. 
+ s = InsertBlockInCompressedCache(block_contents, type, handle); if (!s.ok()) { r->SetStatus(s); } @@ -1111,39 +1283,12 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, r->SetIOStatus(io_s); } } - if (r->compression_opts.parallel_threads > 1) { - if (!r->pc_rep->finished) { - assert(r->pc_rep->raw_bytes_compressed + - r->pc_rep->raw_bytes_curr_block > - 0); - r->pc_rep->curr_compression_ratio.store( - (r->pc_rep->curr_compression_ratio.load( - std::memory_order_relaxed) * - r->pc_rep->raw_bytes_compressed + - block_contents.size()) / - static_cast(r->pc_rep->raw_bytes_compressed + - r->pc_rep->raw_bytes_curr_block), - std::memory_order_relaxed); - r->pc_rep->raw_bytes_compressed += r->pc_rep->raw_bytes_curr_block; - uint64_t new_raw_bytes_inflight = - r->pc_rep->raw_bytes_inflight.fetch_sub( - r->pc_rep->raw_bytes_curr_block, std::memory_order_relaxed) - - r->pc_rep->raw_bytes_curr_block; - uint64_t new_blocks_inflight = r->pc_rep->blocks_inflight.fetch_sub( - 1, std::memory_order_relaxed) - - 1; - assert(new_blocks_inflight < r->compression_opts.parallel_threads); - r->pc_rep->estimated_file_size.store( - r->get_offset() + - static_cast( - static_cast(new_raw_bytes_inflight) * - r->pc_rep->curr_compression_ratio.load( - std::memory_order_relaxed)) + - new_blocks_inflight * kBlockTrailerSize, - std::memory_order_relaxed); + if (r->IsParallelCompressionEnabled()) { + if (is_data_block) { + r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(), + r->get_offset()); } else { - r->pc_rep->estimated_file_size.store(r->get_offset(), - std::memory_order_relaxed); + r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset()); } } } @@ -1157,24 +1302,19 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, void BlockBasedTableBuilder::BGWorkWriteRawBlock() { Rep* r = rep_; - ParallelCompressionRep::BlockRepSlot* slot; - ParallelCompressionRep::BlockRep* block_rep; + ParallelCompressionRep::BlockRepSlot* slot = nullptr; + ParallelCompressionRep::BlockRep* block_rep = nullptr; while (r->pc_rep->write_queue.pop(slot)) { + assert(slot != nullptr); slot->Take(block_rep); + assert(block_rep != nullptr); if (!block_rep->status.ok()) { r->SetStatus(block_rep->status); - // Return block_rep to the pool so that blocked Flush() can finish + // Reap block so that blocked Flush() can finish // if there is one, and Flush() will notice !ok() next time. block_rep->status = Status::OK(); - block_rep->compressed_data->clear(); - r->pc_rep->block_rep_pool.push(block_rep); - // Unlock first block if necessary. 
- if (r->pc_rep->first_block) { - std::lock_guard lock(r->pc_rep->first_block_mutex); - r->pc_rep->first_block = false; - r->pc_rep->first_block_cond.notify_one(); - } - break; + r->pc_rep->ReapBlock(block_rep); + continue; } for (size_t i = 0; i < block_rep->keys->Size(); i++) { @@ -1187,19 +1327,15 @@ void BlockBasedTableBuilder::BGWorkWriteRawBlock() { r->index_builder->OnKeyAdded(key); } - r->pc_rep->raw_bytes_curr_block = block_rep->data->size(); + r->pc_rep->file_size_estimator.SetCurrBlockRawSize(block_rep->data->size()); + WriteRawBlock(block_rep->compressed_contents, block_rep->compression_type, - &r->pending_handle, true /* is_data_block*/); + &r->pending_handle, true /* is_data_block*/, + &block_rep->contents); if (!ok()) { break; } - if (r->pc_rep->first_block) { - std::lock_guard lock(r->pc_rep->first_block_mutex); - r->pc_rep->first_block = false; - r->pc_rep->first_block_cond.notify_one(); - } - if (r->filter_builder != nullptr) { r->filter_builder->StartBlock(r->get_offset()); } @@ -1216,31 +1352,77 @@ void BlockBasedTableBuilder::BGWorkWriteRawBlock() { &first_key_in_next_block, r->pending_handle); } - block_rep->compressed_data->clear(); - r->pc_rep->block_rep_pool.push(block_rep); + + r->pc_rep->ReapBlock(block_rep); } } +void BlockBasedTableBuilder::StartParallelCompression() { + rep_->pc_rep.reset( + new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); + rep_->pc_rep->compress_thread_pool.reserve( + rep_->compression_opts.parallel_threads); + for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { + rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { + BGWorkCompression(*(rep_->compression_ctxs[i]), + rep_->verify_ctxs[i].get()); + }); + } + rep_->pc_rep->write_thread.reset( + new port::Thread([this] { BGWorkWriteRawBlock(); })); +} + +void BlockBasedTableBuilder::StopParallelCompression() { + rep_->pc_rep->compress_queue.finish(); + for (auto& thread : rep_->pc_rep->compress_thread_pool) { + thread.join(); + } + rep_->pc_rep->write_queue.finish(); + rep_->pc_rep->write_thread->join(); +} + Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); } IOStatus BlockBasedTableBuilder::io_status() const { return rep_->GetIOStatus(); } -static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { - BlockContents* bc = reinterpret_cast(value); - delete bc; +namespace { +// Delete the entry resided in the cache. +template +void DeleteEntryCached(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast(value); + delete entry; +} +} // namespace + +// Helper function to setup the cache key's prefix for the Table. 
+void BlockBasedTableBuilder::SetupCacheKeyPrefix( + const TableBuilderOptions& tbo) { + if (rep_->table_options.block_cache.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + rep_->table_options.block_cache.get(), rep_->file->writable_file(), + &rep_->cache_key_prefix[0], &rep_->cache_key_prefix_size, + tbo.db_session_id, tbo.cur_file_num); + } + if (rep_->table_options.block_cache_compressed.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + rep_->table_options.block_cache_compressed.get(), + rep_->file->writable_file(), &rep_->compressed_cache_key_prefix[0], + &rep_->compressed_cache_key_prefix_size, tbo.db_session_id, + tbo.cur_file_num); + } } // // Make a copy of the block contents and insert into compressed block cache // -Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, - const CompressionType type, - const BlockHandle* handle) { +Status BlockBasedTableBuilder::InsertBlockInCompressedCache( + const Slice& block_contents, const CompressionType type, + const BlockHandle* handle) { Rep* r = rep_; Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); - + Status s; if (type != kNoCompression && block_cache_compressed != nullptr) { size_t size = block_contents.size(); @@ -1262,27 +1444,63 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, Slice key(r->compressed_cache_key_prefix, static_cast(end - r->compressed_cache_key_prefix)); - // Insert into compressed block cache. - // How should we deal with compressed cache full? - block_cache_compressed - ->Insert(key, block_contents_to_cache, - block_contents_to_cache->ApproximateMemoryUsage(), - &DeleteCachedBlockContents) - .PermitUncheckedError(); - + s = block_cache_compressed->Insert( + key, block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteEntryCached); + if (s.ok()) { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } // Invalidate OS cache. 
r->file->InvalidateCache(static_cast(r->get_offset()), size) .PermitUncheckedError(); } - return Status::OK(); + return s; +} + +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const BlockHandle* handle) { + // Uncompressed regular block cache + Cache* block_cache = rep_->table_options.block_cache.get(); + Status s; + if (block_cache != nullptr) { + size_t size = block_contents.size(); + auto buf = AllocateBlock(size, block_cache->memory_allocator()); + memcpy(buf.get(), block_contents.data(), size); + BlockContents results(std::move(buf), size); + + char + cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key = BlockBasedTable::GetCacheKey(rep_->cache_key_prefix, + rep_->cache_key_prefix_size, + *handle, cache_key); + + const size_t read_amp_bytes_per_bit = + rep_->table_options.read_amp_bytes_per_bit; + Block* block = new Block(std::move(results), read_amp_bytes_per_bit); + size_t charge = block->ApproximateMemoryUsage(); + s = block_cache->Insert(key, block, charge, &DeleteEntryCached); + if (s.ok()) { + BlockBasedTable::UpdateCacheInsertionMetrics( + BlockType::kData, nullptr /*get_context*/, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES); + } + } + return s; } void BlockBasedTableBuilder::WriteFilterBlock( MetaIndexBuilder* meta_index_builder) { BlockHandle filter_block_handle; - bool empty_filter_block = (rep_->filter_builder == nullptr || - rep_->filter_builder->NumAdded() == 0); + bool empty_filter_block = + (rep_->filter_builder == nullptr || rep_->filter_builder->IsEmpty()); if (ok() && !empty_filter_block) { + rep_->props.num_filter_entries += + rep_->filter_builder->EstimateEntriesAdded(); Status s = Status::Incomplete(); while (ok() && s.IsIncomplete()) { Slice filter_content = @@ -1339,20 +1557,23 @@ void BlockBasedTableBuilder::WriteIndexBlock( } } // If there are more index partitions, finish them and write them out - Status s = index_builder_status; - while (ok() && s.IsIncomplete()) { - s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); - if (!s.ok() && !s.IsIncomplete()) { - rep_->SetStatus(s); - return; - } - if (rep_->table_options.enable_index_compression) { - WriteBlock(index_blocks.index_block_contents, index_block_handle, false); - } else { - WriteRawBlock(index_blocks.index_block_contents, kNoCompression, - index_block_handle); + if (index_builder_status.IsIncomplete()) { + Status s = Status::Incomplete(); + while (ok() && s.IsIncomplete()) { + s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); + if (!s.ok() && !s.IsIncomplete()) { + rep_->SetStatus(s); + return; + } + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, + false); + } else { + WriteRawBlock(index_blocks.index_block_contents, kNoCompression, + index_block_handle); + } + // The last index_block_handle will be for the partition index block } - // The last index_block_handle will be for the partition index block } } @@ -1410,15 +1631,37 @@ void BlockBasedTableBuilder::WritePropertiesBlock( rep_->props.creation_time = rep_->creation_time; rep_->props.oldest_key_time = rep_->oldest_key_time; rep_->props.file_creation_time = rep_->file_creation_time; + if (rep_->sampled_input_data_bytes > 0) { + rep_->props.slow_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_slow_data_bytes) / + rep_->sampled_input_data_bytes * + 
rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + rep_->props.fast_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_fast_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + } else if (rep_->sample_for_compression > 0) { + // We tried to sample but none were found. Assume worst-case (compression + // ratio 1.0) so data is complete and aggregatable. + rep_->props.slow_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + rep_->props.fast_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + } rep_->props.db_id = rep_->db_id; rep_->props.db_session_id = rep_->db_session_id; + rep_->props.db_host_id = rep_->db_host_id; // Add basic properties property_block_builder.AddTableProperty(rep_->props); // Add use collected properties NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, - rep_->ioptions.info_log, + rep_->ioptions.logger, &property_block_builder); WriteRawBlock(property_block_builder.Finish(), kNoCompression, @@ -1497,11 +1740,12 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, footer.EncodeTo(&footer_encoding); assert(ok()); IOStatus ios = r->file->Append(footer_encoding); - r->SetIOStatus(ios); if (ios.ok()) { r->set_offset(r->get_offset() + footer_encoding.size()); + } else { + r->SetIOStatus(ios); + r->SetStatus(ios); } - r->SyncStatusFromIOStatus(); } void BlockBasedTableBuilder::EnterUnbuffered() { @@ -1511,20 +1755,45 @@ void BlockBasedTableBuilder::EnterUnbuffered() { const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0 ? r->compression_opts.zstd_max_train_bytes : r->compression_opts.max_dict_bytes; - Random64 generator{r->creation_time}; + const size_t kNumBlocksBuffered = r->data_block_buffers.size(); + if (kNumBlocksBuffered == 0) { + // The below code is neither safe nor necessary for handling zero data + // blocks. + return; + } + + // Abstract algebra teaches us that a finite cyclic group (such as the + // additive group of integers modulo N) can be generated by a number that is + // coprime with N. Since N is variable (number of buffered data blocks), we + // must then pick a prime number in order to guarantee coprimeness with any N. + // + // One downside of this approach is the spread will be poor when + // `kPrimeGeneratorRemainder` is close to zero or close to + // `kNumBlocksBuffered`. + // + // Picked a random number between one and one trillion and then chose the + // next prime number greater than or equal to it. + const uint64_t kPrimeGenerator = 545055921143ull; + // Can avoid repeated division by just adding the remainder repeatedly. 
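A worked example of the walk performed below, assuming ten buffered data blocks (the numbers are illustrative only):
// kPrimeGeneratorRemainder = 545055921143 % 10 = 3
// kInitSampleIdx           = 10 / 2           = 5
// Visiting order of buffer_idx: 5, 8, 1, 4, 7, 0, 3, 6, 9, 2
// Because gcd(3, 10) == 1, each buffered block is visited exactly once; the
// loop stops earlier if kSampleBytes of sample data has already been collected.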
+ const size_t kPrimeGeneratorRemainder = static_cast( + kPrimeGenerator % static_cast(kNumBlocksBuffered)); + const size_t kInitSampleIdx = kNumBlocksBuffered / 2; + std::string compression_dict_samples; std::vector compression_dict_sample_lens; - if (!r->data_block_and_keys_buffers.empty()) { - while (compression_dict_samples.size() < kSampleBytes) { - size_t rand_idx = - static_cast( - generator.Uniform(r->data_block_and_keys_buffers.size())); - size_t copy_len = - std::min(kSampleBytes - compression_dict_samples.size(), - r->data_block_and_keys_buffers[rand_idx].first.size()); - compression_dict_samples.append( - r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len); - compression_dict_sample_lens.emplace_back(copy_len); + size_t buffer_idx = kInitSampleIdx; + for (size_t i = 0; + i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes; + ++i) { + size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), + r->data_block_buffers[buffer_idx].size()); + compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0, + copy_len); + compression_dict_sample_lens.emplace_back(copy_len); + + buffer_idx += kPrimeGeneratorRemainder; + if (buffer_idx >= kNumBlocksBuffered) { + buffer_idx -= kNumBlocksBuffered; } } @@ -1544,70 +1813,58 @@ void BlockBasedTableBuilder::EnterUnbuffered() { dict, r->compression_type == kZSTD || r->compression_type == kZSTDNotFinalCompression)); - for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { - auto& data_block = r->data_block_and_keys_buffers[i].first; - auto& keys = r->data_block_and_keys_buffers[i].second; + auto get_iterator_for_block = [&r](size_t i) { + auto& data_block = r->data_block_buffers[i]; assert(!data_block.empty()); - assert(!keys.empty()); - if (r->compression_opts.parallel_threads > 1) { - ParallelCompressionRep::BlockRep* block_rep; - r->pc_rep->block_rep_pool.pop(block_rep); + Block reader{BlockContents{data_block}}; + DataBlockIter* iter = reader.NewDataIterator( + r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber); - std::swap(*(block_rep->data), data_block); - block_rep->contents = *(block_rep->data); - - block_rep->compression_type = r->compression_type; + iter->SeekToFirst(); + assert(iter->Valid()); + return std::unique_ptr(iter); + }; - block_rep->keys->SwapAssign(keys); + std::unique_ptr iter = nullptr, next_block_iter = nullptr; - if (i + 1 < r->data_block_and_keys_buffers.size()) { - block_rep->first_key_in_next_block->assign( - r->data_block_and_keys_buffers[i + 1].second.front()); - } else { - if (r->first_key_in_next_block == nullptr) { - block_rep->first_key_in_next_block.reset(nullptr); - } else { - block_rep->first_key_in_next_block->assign( - r->first_key_in_next_block->data(), - r->first_key_in_next_block->size()); - } - } + for (size_t i = 0; ok() && i < r->data_block_buffers.size(); ++i) { + if (iter == nullptr) { + iter = get_iterator_for_block(i); + assert(iter != nullptr); + }; - uint64_t new_raw_bytes_inflight = - r->pc_rep->raw_bytes_inflight.fetch_add(block_rep->data->size(), - std::memory_order_relaxed) + - block_rep->data->size(); - uint64_t new_blocks_inflight = - r->pc_rep->blocks_inflight.fetch_add(1, std::memory_order_relaxed) + - 1; - r->pc_rep->estimated_file_size.store( - r->get_offset() + - static_cast( - static_cast(new_raw_bytes_inflight) * - r->pc_rep->curr_compression_ratio.load( - std::memory_order_relaxed)) + - new_blocks_inflight * kBlockTrailerSize, - std::memory_order_relaxed); + if (i + 1 < 
r->data_block_buffers.size()) { + next_block_iter = get_iterator_for_block(i + 1); + } - // Read out first_block here to avoid data race with BGWorkWriteRawBlock - bool first_block = r->pc_rep->first_block; + auto& data_block = r->data_block_buffers[i]; - assert(block_rep->status.ok()); - if (!r->pc_rep->write_queue.push(block_rep->slot.get())) { - return; - } - if (!r->pc_rep->compress_queue.push(block_rep)) { - return; + if (r->IsParallelCompressionEnabled()) { + Slice first_key_in_next_block; + const Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + if (i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + first_key_in_next_block = next_block_iter->key(); + } else { + first_key_in_next_block_ptr = r->first_key_in_next_block; } - if (first_block) { - std::unique_lock lock(r->pc_rep->first_block_mutex); - r->pc_rep->first_block_cond.wait( - lock, [r] { return !r->pc_rep->first_block; }); + std::vector keys; + for (; iter->Valid(); iter->Next()) { + keys.emplace_back(iter->key().ToString()); } + + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, first_key_in_next_block_ptr, &data_block, &keys); + + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); } else { - for (const auto& key : keys) { + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); if (r->filter_builder != nullptr) { size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); @@ -1617,16 +1874,22 @@ void BlockBasedTableBuilder::EnterUnbuffered() { } WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); - if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { - Slice first_key_in_next_block = - r->data_block_and_keys_buffers[i + 1].second.front(); + if (ok() && i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + Slice first_key_in_next_block = next_block_iter->key(); + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; - r->index_builder->AddIndexEntry( - &keys.back(), first_key_in_next_block_ptr, r->pending_handle); + + iter->SeekToLast(); + std::string last_key = iter->key().ToString(); + r->index_builder->AddIndexEntry(&last_key, first_key_in_next_block_ptr, + r->pending_handle); } } + + std::swap(iter, next_block_iter); } - r->data_block_and_keys_buffers.clear(); + r->data_block_buffers.clear(); } Status BlockBasedTableBuilder::Finish() { @@ -1638,14 +1901,8 @@ Status BlockBasedTableBuilder::Finish() { if (r->state == Rep::State::kBuffered) { EnterUnbuffered(); } - if (r->compression_opts.parallel_threads > 1) { - r->pc_rep->compress_queue.finish(); - for (auto& thread : r->pc_rep->compress_thread_pool) { - thread.join(); - } - r->pc_rep->write_queue.finish(); - r->pc_rep->write_thread->join(); - r->pc_rep->finished = true; + if (r->IsParallelCompressionEnabled()) { + StopParallelCompression(); #ifndef NDEBUG for (const auto& br : r->pc_rep->block_rep_buf) { assert(br.status.ok()); @@ -1684,25 +1941,20 @@ Status BlockBasedTableBuilder::Finish() { WriteFooter(metaindex_block_handle, index_block_handle); } r->state = Rep::State::kClosed; - Status ret_status = r->GetStatus(); + r->SetStatus(r->CopyIOStatus()); + Status ret_status = r->CopyStatus(); assert(!ret_status.ok() || io_status().ok()); return ret_status; } void BlockBasedTableBuilder::Abandon() { assert(rep_->state != Rep::State::kClosed); - if (rep_->compression_opts.parallel_threads > 1) { 
- rep_->pc_rep->compress_queue.finish(); - for (auto& thread : rep_->pc_rep->compress_thread_pool) { - thread.join(); - } - rep_->pc_rep->write_queue.finish(); - rep_->pc_rep->write_thread->join(); - rep_->pc_rep->finished = true; + if (rep_->IsParallelCompressionEnabled()) { + StopParallelCompression(); } rep_->state = Rep::State::kClosed; - rep_->GetStatus().PermitUncheckedError(); - rep_->GetIOStatus().PermitUncheckedError(); + rep_->CopyStatus().PermitUncheckedError(); + rep_->CopyIOStatus().PermitUncheckedError(); } uint64_t BlockBasedTableBuilder::NumEntries() const { @@ -1716,10 +1968,10 @@ bool BlockBasedTableBuilder::IsEmpty() const { uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { - if (rep_->compression_opts.parallel_threads > 1) { + if (rep_->IsParallelCompressionEnabled()) { // Use compression ratio so far and inflight raw bytes to estimate // final SST size. - return rep_->pc_rep->estimated_file_size.load(std::memory_order_relaxed); + return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize(); } else { return FileSize(); } diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 2e3081d26ce..65be35b1948 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -38,21 +38,9 @@ class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). - BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const int level_at_creation, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, - const uint64_t target_file_size = 0, - const uint64_t file_creation_time = 0, const std::string& db_id = "", - const std::string& db_session_id = ""); + BlockBasedTableBuilder(const BlockBasedTableOptions& table_options, + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file); // No copying allowed BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; @@ -117,8 +105,9 @@ class BlockBasedTableBuilder : public TableBuilder { // REQUIRES: `rep_->state == kBuffered` void EnterUnbuffered(); - // Call block's Finish() method - // and then write the compressed block contents to file. + // Call block's Finish() method and then + // - in buffered mode, buffer the uncompressed block contents. + // - in unbuffered mode, write the compressed block contents to file. void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); // Compress and write block content to the file. @@ -126,10 +115,16 @@ class BlockBasedTableBuilder : public TableBuilder { bool is_data_block); // Directly write data to the file. 
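// EstimatedFileSize() above now delegates to a file-size estimator whose
// stated job is to combine the compression ratio observed so far with the raw
// bytes still in flight in the pipeline. A minimal sketch of that arithmetic,
// assuming those two quantities are all that is tracked; the real
// ParallelCompressionRep file_size_estimator is not shown in these hunks.
#include <cstdint>

class ToyFileSizeEstimator {
 public:
  // Called when a block has been compressed and written out.
  void BlockWritten(uint64_t raw_bytes, uint64_t compressed_bytes) {
    raw_written_ += raw_bytes;
    compressed_written_ += compressed_bytes;
  }
  // Called when a raw block is handed to (or taken back from) the workers.
  void BlockQueued(uint64_t raw_bytes) { raw_inflight_ += raw_bytes; }
  void BlockDequeued(uint64_t raw_bytes) { raw_inflight_ -= raw_bytes; }

  uint64_t EstimatedFileSize() const {
    // Compression ratio observed so far; assume 1.0 before the first block
    // finishes, i.e. in-flight bytes are counted uncompressed.
    double ratio = raw_written_ == 0
                       ? 1.0
                       : static_cast<double>(compressed_written_) /
                             static_cast<double>(raw_written_);
    return compressed_written_ +
           static_cast<uint64_t>(static_cast<double>(raw_inflight_) * ratio);
  }

 private:
  uint64_t raw_written_ = 0;
  uint64_t compressed_written_ = 0;
  uint64_t raw_inflight_ = 0;
};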
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, - bool is_data_block = false); + bool is_data_block = false, + const Slice* raw_data = nullptr); + + void SetupCacheKeyPrefix(const TableBuilderOptions& tbo); + Status InsertBlockInCache(const Slice& block_contents, - const CompressionType type, const BlockHandle* handle); + Status InsertBlockInCompressedCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, @@ -159,19 +154,29 @@ class BlockBasedTableBuilder : public TableBuilder { // Get blocks from mem-table walking thread, compress them and // pass them to the write thread. Used in parallel compression mode only - void BGWorkCompression(CompressionContext& compression_ctx, + void BGWorkCompression(const CompressionContext& compression_ctx, UncompressionContext* verify_ctx); // Given raw block content, try to compress it and return result and // compression type - void CompressAndVerifyBlock( - const Slice& raw_block_contents, bool is_data_block, - CompressionContext& compression_ctx, UncompressionContext* verify_ctx, - std::string* compressed_output, Slice* result_block_contents, - CompressionType* result_compression_type, Status* out_status); + void CompressAndVerifyBlock(const Slice& raw_block_contents, + bool is_data_block, + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx, + std::string* compressed_output, + Slice* result_block_contents, + CompressionType* result_compression_type, + Status* out_status); // Get compressed blocks from BGWorkCompression and write them into SST void BGWorkWriteRawBlock(); + + // Initialize parallel compression context and + // start BGWorkCompression and BGWorkWriteRawBlock threads + void StartParallelCompression(); + + // Stop BGWorkCompression and BGWorkWriteRawBlock threads + void StopParallelCompression(); }; Slice CompressBlock(const Slice& raw, const CompressionInfo& info, diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 25e38f94c93..831235f489a 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -160,6 +160,16 @@ size_t TailPrefetchStats::GetSuggestedPrefetchSize() { } #ifndef ROCKSDB_LITE + +const std::string kOptNameMetadataCacheOpts = "metadata_cache_options"; + +static std::unordered_map + pinning_tier_type_string_map = { + {"kFallback", PinningTier::kFallback}, + {"kNone", PinningTier::kNone}, + {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar}, + {"kAll", PinningTier::kAll}}; + static std::unordered_map block_base_table_index_type_string_map = { {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, @@ -187,6 +197,29 @@ static std::unordered_map + metadata_cache_options_type_info = { + {"top_level_index_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, top_level_index_pinning), + &pinning_tier_type_string_map)}, + {"partition_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, partition_pinning), + &pinning_tier_type_string_map)}, + {"unpartitioned_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, unpartitioned_pinning), + &pinning_tier_type_string_map)}}; + +static std::unordered_map + block_base_table_prepopulate_block_cache_string_map = { + {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable}, 
+ {"kFlushOnly", + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}}; + #endif // ROCKSDB_LITE static std::unordered_map @@ -280,17 +313,16 @@ static std::unordered_map OptionTypeFlags::kNone, // Parses the Filter policy [](const ConfigOptions& opts, const std::string&, - const std::string& value, char* addr) { + const std::string& value, void* addr) { auto* policy = - reinterpret_cast*>(addr); + static_cast*>(addr); return FilterPolicy::CreateFromString(opts, value, policy); }, // Converts the FilterPolicy to its string representation - [](const ConfigOptions&, const std::string&, const char* addr, + [](const ConfigOptions&, const std::string&, const void* addr, std::string* value) { const auto* policy = - reinterpret_cast*>( - addr); + static_cast*>(addr); if (policy->get()) { *value = (*policy)->Name(); } else { @@ -299,15 +331,13 @@ static std::unordered_map return Status::OK(); }, // Compares two FilterPolicy objects for equality - [](const ConfigOptions&, const std::string&, const char* addr1, - const char* addr2, std::string*) { + [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { const auto* policy1 = - reinterpret_cast*>( - addr1) + static_cast*>(addr1) ->get(); const auto* policy2 = - reinterpret_cast*>(addr2) - ->get(); + static_cast*>(addr2)->get(); if (policy1 == policy2) { return true; } else if (policy1 != nullptr && policy2 != nullptr) { @@ -336,7 +366,7 @@ static std::unordered_map OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { + const std::string& value, void* addr) { // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13 // and 6.14. The bug will write out 8 bytes to OPTIONS file from the // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit @@ -347,7 +377,7 @@ static std::unordered_map // generated by affected releases before the fix, we need to // manually parse read_amp_bytes_per_bit with this special hack. 
uint64_t read_amp_bytes_per_bit = ParseUint64(value); - *(reinterpret_cast(addr)) = + *(static_cast(addr)) = static_cast(read_amp_bytes_per_bit); return Status::OK(); }}}, @@ -364,14 +394,19 @@ static std::unordered_map pin_top_level_index_and_filter), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {kOptNameMetadataCacheOpts, + OptionTypeInfo::Struct( + kOptNameMetadataCacheOpts, &metadata_cache_options_type_info, + offsetof(struct BlockBasedTableOptions, metadata_cache_options), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, {"block_cache", {offsetof(struct BlockBasedTableOptions, block_cache), OptionType::kUnknown, OptionVerificationType::kNormal, (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), // Parses the input vsalue as a Cache [](const ConfigOptions& opts, const std::string&, - const std::string& value, char* addr) { - auto* cache = reinterpret_cast*>(addr); + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); return Cache::CreateFromString(opts, value, cache); }}}, {"block_cache_compressed", @@ -380,10 +415,19 @@ static std::unordered_map (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), // Parses the input vsalue as a Cache [](const ConfigOptions& opts, const std::string&, - const std::string& value, char* addr) { - auto* cache = reinterpret_cast*>(addr); + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); return Cache::CreateFromString(opts, value, cache); }}}, + {"max_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"prepopulate_block_cache", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, prepopulate_block_cache), + &block_base_table_prepopulate_block_cache_string_map)}, + #endif // ROCKSDB_LITE }; @@ -393,8 +437,7 @@ BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) : table_options_(_table_options) { InitializeOptions(); - ConfigurableHelper::RegisterOptions(*this, &table_options_, - &block_based_table_type_info); + RegisterOptions(&table_options_, &block_based_table_type_info); } void BlockBasedTableFactory::InitializeOptions() { @@ -454,28 +497,16 @@ Status BlockBasedTableFactory::NewTableReader( table_reader_options.largest_seqno, table_reader_options.force_direct_prefetch, &tail_prefetch_stats_, table_reader_options.block_cache_tracer, - table_reader_options.max_file_size_for_l0_meta_pin); + table_reader_options.max_file_size_for_l0_meta_pin, + table_reader_options.cur_db_session_id, + table_reader_options.cur_file_num); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - auto table_builder = new BlockBasedTableBuilder( - table_builder_options.ioptions, table_builder_options.moptions, - table_options_, table_builder_options.internal_comparator, - table_builder_options.int_tbl_prop_collector_factories, column_family_id, - file, table_builder_options.compression_type, - table_builder_options.sample_for_compression, - table_builder_options.compression_opts, - table_builder_options.skip_filters, - table_builder_options.column_family_name, table_builder_options.level, - table_builder_options.creation_time, - table_builder_options.oldest_key_time, - 
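// The string maps registered above (pinning_tier_type_string_map and
// block_base_table_prepopulate_block_cache_string_map, handed to
// OptionTypeInfo::Enum) are what lets the options framework turn text such as
// "kFlushedAndSimilar" into an enum value and back when reading or writing an
// OPTIONS file. A standalone sketch of that round trip using a local enum and
// hand-rolled helpers; it only illustrates the mapping, not OptionTypeInfo.
#include <string>
#include <unordered_map>

enum class ToyPinningTier { kFallback, kNone, kFlushedAndSimilar, kAll };

static const std::unordered_map<std::string, ToyPinningTier> kTierByName = {
    {"kFallback", ToyPinningTier::kFallback},
    {"kNone", ToyPinningTier::kNone},
    {"kFlushedAndSimilar", ToyPinningTier::kFlushedAndSimilar},
    {"kAll", ToyPinningTier::kAll}};

// Parse direction: e.g. "metadata_cache_options={partition_pinning=kAll;}".
bool ParseTier(const std::string& name, ToyPinningTier* out) {
  auto it = kTierByName.find(name);
  if (it == kTierByName.end()) {
    return false;  // unknown token; the real code reports an error status
  }
  *out = it->second;
  return true;
}

// Serialize direction: reverse lookup used when writing options back out.
std::string TierToString(ToyPinningTier tier) {
  for (const auto& kv : kTierByName) {
    if (kv.second == tier) {
      return kv.first;
    }
  }
  return "kFallback";
}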
table_builder_options.target_file_size, - table_builder_options.file_creation_time, table_builder_options.db_id, - table_builder_options.db_session_id); - - return table_builder; + return new BlockBasedTableBuilder(table_options_, table_builder_options, + file); } Status BlockBasedTableFactory::ValidateOptions( @@ -656,6 +687,13 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " block_align: %d\n", table_options_.block_align); ret.append(buffer); + snprintf(buffer, kBufferSize, + " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.max_auto_readahead_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n", + static_cast(table_options_.prepopulate_block_cache)); + ret.append(buffer); return ret; } diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h index a7120f8543b..534746b9d30 100644 --- a/table/block_based/block_based_table_factory.h +++ b/table/block_based/block_based_table_factory.h @@ -46,6 +46,9 @@ class BlockBasedTableFactory : public TableFactory { ~BlockBasedTableFactory() {} + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kBlockBasedTableName(); } + const char* Name() const override { return kBlockBasedTableName(); } using TableFactory::NewTableReader; @@ -57,7 +60,7 @@ class BlockBasedTableFactory : public TableFactory { TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; // Valdates the specified DB Options. Status ValidateOptions(const DBOptions& db_opts, diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index e1f15057300..181e7824b11 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -230,7 +230,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { if (need_upper_bound_check_ && direction == IterDirection::kBackward) { - // Upper bound check isn't sufficnet for backward direction to + // Upper bound check isn't sufficient for backward direction to // guarantee the same result as total order, so disable prefix // check. 
return true; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index be119194b6f..acb58138de6 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -15,8 +15,8 @@ #include #include +#include "cache/cache_entry_roles.h" #include "cache/sharded_cache.h" - #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "file/file_prefetch_buffer.h" @@ -24,6 +24,7 @@ #include "file/random_access_file_reader.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" +#include "port/lang.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" @@ -32,6 +33,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/block_based/binary_search_index_reader.h" @@ -39,7 +41,9 @@ #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_iterator.h" +#include "table/block_based/block_like_traits.h" #include "table/block_based/block_prefix_index.h" +#include "table/block_based/block_type.h" #include "table/block_based/filter_block.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/hash_index_reader.h" @@ -54,15 +58,11 @@ #include "table/persistent_cache_helper.h" #include "table/sst_file_writer_collectors.h" #include "table/two_level_iterator.h" - -#include "monitoring/perf_context_imp.h" -#include "port/lang.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { @@ -70,84 +70,12 @@ extern const uint64_t kBlockBasedTableMagicNumber; extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -typedef BlockBasedTable::IndexReader IndexReader; - -// Found that 256 KB readahead size provides the best performance, based on -// experiments, for auto readahead. Experiment data is in PR #3282. 
-const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; - BlockBasedTable::~BlockBasedTable() { delete rep_; } std::atomic BlockBasedTable::next_cache_key_id_(0); -template -class BlocklikeTraits; - -template <> -class BlocklikeTraits { - public: - static BlockContents* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* /* filter_policy */) { - return new BlockContents(std::move(contents)); - } - - static uint32_t GetNumRestarts(const BlockContents& /* contents */) { - return 0; - } -}; - -template <> -class BlocklikeTraits { - public: - static ParsedFullFilterBlock* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* filter_policy) { - return new ParsedFullFilterBlock(filter_policy, std::move(contents)); - } - - static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { - return 0; - } -}; - -template <> -class BlocklikeTraits { - public: - static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit, - Statistics* statistics, bool /* using_zstd */, - const FilterPolicy* /* filter_policy */) { - return new Block(std::move(contents), read_amp_bytes_per_bit, statistics); - } - - static uint32_t GetNumRestarts(const Block& block) { - return block.NumRestarts(); - } -}; - -template <> -class BlocklikeTraits { - public: - static UncompressionDict* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool using_zstd, - const FilterPolicy* /* filter_policy */) { - return new UncompressionDict(contents.data, std::move(contents.allocation), - using_zstd); - } - - static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { - return 0; - } -}; - namespace { // Read the block identified by "handle" from "file". // The only relevant option is options.verify_checksums for now. @@ -159,7 +87,7 @@ template Status ReadBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - std::unique_ptr* result, const ImmutableCFOptions& ioptions, + std::unique_ptr* result, const ImmutableOptions& ioptions, bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit, @@ -175,20 +103,13 @@ Status ReadBlockFromFile( Status s = block_fetcher.ReadBlockContents(); if (s.ok()) { result->reset(BlocklikeTraits::Create( - std::move(contents), read_amp_bytes_per_bit, ioptions.statistics, - using_zstd, filter_policy)); + std::move(contents), read_amp_bytes_per_bit, ioptions.stats, using_zstd, + filter_policy)); } return s; } -// Delete the entry resided in the cache. -template -void DeleteCachedEntry(const Slice& /*key*/, void* value) { - auto entry = reinterpret_cast(value); - delete entry; -} - // Release the cached entry and decrement its ref count. 
// Do not force erase void ReleaseCachedEntry(void* arg, void* h) { @@ -230,7 +151,7 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const { - Statistics* const statistics = rep_->ioptions.statistics; + Statistics* const statistics = rep_->ioptions.stats; PERF_COUNTER_ADD(block_cache_hit_count, 1); PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, @@ -288,7 +209,7 @@ void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const { - Statistics* const statistics = rep_->ioptions.statistics; + Statistics* const statistics = rep_->ioptions.stats; // TODO: introduce aggregate (not per-level) block cache miss count PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, @@ -338,12 +259,9 @@ void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, } } -void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, - GetContext* get_context, - size_t usage, - bool redundant) const { - Statistics* const statistics = rep_->ioptions.statistics; - +void BlockBasedTable::UpdateCacheInsertionMetrics( + BlockType block_type, GetContext* get_context, size_t usage, bool redundant, + Statistics* const statistics) { // TODO: introduce perf counters for block cache insertions if (get_context) { ++get_context->get_context_stats_.num_cache_add; @@ -432,9 +350,12 @@ void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, } Cache::Handle* BlockBasedTable::GetEntryFromCache( - Cache* block_cache, const Slice& key, BlockType block_type, - GetContext* get_context) const { - auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + Cache* block_cache, const Slice& key, BlockType block_type, const bool wait, + GetContext* get_context, const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, Cache::Priority priority) const { + auto cache_handle = + block_cache->Lookup(key, cache_helper, create_cb, priority, wait, + rep_->ioptions.statistics.get()); if (cache_handle != nullptr) { UpdateCacheHitMetrics(block_type, get_context, @@ -447,26 +368,29 @@ Cache::Handle* BlockBasedTable::GetEntryFromCache( } // Helper function to setup the cache key's prefix for the Table. 
-void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, + const std::string& db_session_id, + uint64_t cur_file_num) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; if (rep->table_options.block_cache != nullptr) { GenerateCachePrefix( rep->table_options.block_cache.get(), rep->file->file(), - &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size, db_session_id, + cur_file_num); } if (rep->table_options.persistent_cache != nullptr) { GenerateCachePrefix( rep->table_options.persistent_cache.get(), rep->file->file(), &rep->persistent_cache_key_prefix[0], - &rep->persistent_cache_key_prefix_size); + &rep->persistent_cache_key_prefix_size, "", cur_file_num); } if (rep->table_options.block_cache_compressed != nullptr) { GenerateCachePrefix( rep->table_options.block_cache_compressed.get(), rep->file->file(), &rep->compressed_cache_key_prefix[0], - &rep->compressed_cache_key_prefix_size); + &rep->compressed_cache_key_prefix_size, "", cur_file_num); } } @@ -577,7 +501,7 @@ Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, } Status BlockBasedTable::Open( - const ReadOptions& read_options, const ImmutableCFOptions& ioptions, + const ReadOptions& read_options, const ImmutableOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, @@ -588,7 +512,8 @@ Status BlockBasedTable::Open( const SequenceNumber largest_seqno, const bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, BlockCacheTracer* const block_cache_tracer, - size_t max_file_size_for_l0_meta_pin) { + size_t max_file_size_for_l0_meta_pin, const std::string& db_session_id, + uint64_t cur_file_num) { table_reader->reset(); Status s; @@ -612,6 +537,10 @@ Status BlockBasedTable::Open( s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, tail_prefetch_stats, prefetch_all, preload_all, &prefetch_buffer); + // Return error in prefetch path to users. + if (!s.ok()) { + return s; + } } else { // Should not prefetch for mmap mode. prefetch_buffer.reset(new FilePrefetchBuffer( @@ -627,7 +556,7 @@ Status BlockBasedTable::Open( // 6. [meta block: index] // 7. [meta block: filter] IOOptions opts; - s = PrepareIOFromReadOptions(ro, file->env(), opts); + s = file->PrepareIOOptions(ro, opts); if (s.ok()) { s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size, &footer, kBlockBasedTableMagicNumber); @@ -658,7 +587,7 @@ Status BlockBasedTable::Open( rep->internal_prefix_transform.reset( new InternalKeySliceTransform(prefix_extractor)); } - SetupCacheKeyPrefix(rep); + SetupCacheKeyPrefix(rep, db_session_id, cur_file_num); std::unique_ptr new_table( new BlockBasedTable(rep, block_cache_tracer)); @@ -667,7 +596,7 @@ Status BlockBasedTable::Open( PersistentCacheOptions(rep->table_options.persistent_cache, std::string(rep->persistent_cache_key_prefix, rep->persistent_cache_key_prefix_size), - rep->ioptions.statistics); + rep->ioptions.stats); // Meta-blocks are not dictionary compressed. 
Explicitly set the dictionary // handle to null, otherwise it may be seen as uninitialized during the below @@ -760,7 +689,7 @@ Status BlockBasedTable::PrefetchTail( // Use `FilePrefetchBuffer` prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); IOOptions opts; - Status s = PrepareIOFromReadOptions(ro, file->env(), opts); + Status s = file->PrepareIOOptions(ro, opts); if (s.ok()) { s = (*prefetch_buffer)->Prefetch(opts, file, prefetch_off, prefetch_len); } @@ -810,7 +739,7 @@ Status BlockBasedTable::ReadPropertiesBlock( s = SeekToPropertiesBlock(meta_iter, &found_properties_block); if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Error when seeking to properties block from file: %s", s.ToString().c_str()); } else if (found_properties_block) { @@ -837,7 +766,7 @@ Status BlockBasedTable::ReadPropertiesBlock( } if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Encountered error while reading data from properties " "block %s", s.ToString().c_str()); @@ -854,7 +783,7 @@ Status BlockBasedTable::ReadPropertiesBlock( CompressionTypeToString(kZSTDNotFinalCompression)); } } else { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.logger, "Cannot find Properties block from file."); } #ifndef ROCKSDB_LITE @@ -869,11 +798,10 @@ Status BlockBasedTable::ReadPropertiesBlock( rep_->whole_key_filtering &= IsFeatureSupported(*(rep_->table_properties), BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep_->ioptions.info_log); - rep_->prefix_filtering &= - IsFeatureSupported(*(rep_->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, - rep_->ioptions.info_log); + rep_->ioptions.logger); + rep_->prefix_filtering &= IsFeatureSupported( + *(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger); rep_->index_key_includes_seq = rep_->table_properties->index_key_is_user_key == 0; @@ -896,7 +824,7 @@ Status BlockBasedTable::ReadPropertiesBlock( s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, &(rep_->global_seqno)); if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); + ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str()); } } return s; @@ -913,7 +841,7 @@ Status BlockBasedTable::ReadRangeDelBlock( s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); if (!s.ok()) { ROCKS_LOG_WARN( - rep_->ioptions.info_log, + rep_->ioptions.logger, "Error when seeking to range delete tombstones block from file: %s", s.ToString().c_str()); } else if (found_range_del_block && !range_del_handle.IsNull()) { @@ -925,7 +853,7 @@ Status BlockBasedTable::ReadRangeDelBlock( s = iter->status(); if (!s.ok()) { ROCKS_LOG_WARN( - rep_->ioptions.info_log, + rep_->ioptions.logger, "Encountered error while reading data from range del block %s", s.ToString().c_str()); IGNORE_STATUS_IF_ERROR(s); @@ -974,6 +902,9 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } } + // Partition filters cannot be enabled without partition indexes + assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter || + rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // Find compression dictionary handle bool found_compression_dict = false; @@ -987,20 +918,53 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( const bool use_cache = table_options.cache_index_and_filter_blocks; - // pin both index and filters, down to all partitions. 
- const bool pin_all = - rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && + const bool maybe_flushed = level == 0 && file_size <= max_file_size_for_l0_meta_pin; + std::function is_pinned = + [maybe_flushed, &is_pinned](PinningTier pinning_tier, + PinningTier fallback_pinning_tier) { + // Fallback to fallback would lead to infinite recursion. Disallow it. + assert(fallback_pinning_tier != PinningTier::kFallback); + + switch (pinning_tier) { + case PinningTier::kFallback: + return is_pinned(fallback_pinning_tier, + PinningTier::kNone /* fallback_pinning_tier */); + case PinningTier::kNone: + return false; + case PinningTier::kFlushedAndSimilar: + return maybe_flushed; + case PinningTier::kAll: + return true; + }; + + // In GCC, this is needed to suppress `control reaches end of non-void + // function [-Werror=return-type]`. + assert(false); + return false; + }; + const bool pin_top_level_index = is_pinned( + table_options.metadata_cache_options.top_level_index_pinning, + table_options.pin_top_level_index_and_filter ? PinningTier::kAll + : PinningTier::kNone); + const bool pin_partition = + is_pinned(table_options.metadata_cache_options.partition_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? PinningTier::kFlushedAndSimilar + : PinningTier::kNone); + const bool pin_unpartitioned = + is_pinned(table_options.metadata_cache_options.unpartitioned_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? PinningTier::kFlushedAndSimilar + : PinningTier::kNone); - // prefetch the first level of index - const bool prefetch_index = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // pin the first level of index const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch + ? pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of index + const bool prefetch_index = prefetch_all || pin_index; std::unique_ptr index_reader; s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache, @@ -1015,44 +979,43 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // The partitions of partitioned index are always stored in cache. They // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks - if (prefetch_all) { - s = rep_->index_reader->CacheDependencies(ro, pin_all); + if (prefetch_all || pin_partition) { + s = rep_->index_reader->CacheDependencies(ro, pin_partition); } if (!s.ok()) { return s; } - // prefetch the first level of filter - const bool prefetch_filter = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - rep_->filter_type == Rep::FilterType::kPartitionedFilter); - // Partition fitlers cannot be enabled without partition indexes - assert(!prefetch_filter || prefetch_index); // pin the first level of filter const bool pin_filter = - pin_all || (table_options.pin_top_level_index_and_filter && - rep_->filter_type == Rep::FilterType::kPartitionedFilter); + rep_->filter_type == Rep::FilterType::kPartitionedFilter + ? 
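// The is_pinned lambda above resolves a per-tier MetadataCacheOptions setting
// against a fallback derived from the legacy pin_* booleans. The same decision
// written as a free function over a local enum, so it can be read in
// isolation; the behaviour mirrors the switch in the hunk above, and the enum
// is a local copy rather than the real PinningTier.
#include <cassert>

enum class ToyPinningTier { kFallback, kNone, kFlushedAndSimilar, kAll };

bool IsPinned(ToyPinningTier tier, ToyPinningTier fallback_tier,
              bool maybe_flushed /* L0 file small enough to treat as flushed */) {
  // Falling back to kFallback would recurse forever, so it is disallowed,
  // mirroring the assert in the lambda.
  assert(fallback_tier != ToyPinningTier::kFallback);
  switch (tier) {
    case ToyPinningTier::kFallback:
      return IsPinned(fallback_tier, ToyPinningTier::kNone, maybe_flushed);
    case ToyPinningTier::kNone:
      return false;
    case ToyPinningTier::kFlushedAndSimilar:
      return maybe_flushed;
    case ToyPinningTier::kAll:
      return true;
  }
  return false;  // unreachable; keeps -Werror=return-type quiet, as above
}

// Example: with top_level_index_pinning left at kFallback and
// pin_top_level_index_and_filter == true, the fallback resolves to kAll, so
// IsPinned(ToyPinningTier::kFallback, ToyPinningTier::kAll, false) is true.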
pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of filter + const bool prefetch_filter = prefetch_all || pin_filter; if (rep_->filter_policy) { auto filter = new_table->CreateFilterBlockReader( ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter, lookup_context); + if (filter) { // Refer to the comment above about paritioned indexes always being cached - if (prefetch_all) { - filter->CacheDependencies(ro, pin_all); + if (prefetch_all || pin_partition) { + s = filter->CacheDependencies(ro, pin_partition); + if (!s.ok()) { + return s; + } } - rep_->filter = std::move(filter); } } if (!rep_->compression_dict_handle.IsNull()) { std::unique_ptr uncompression_dict_reader; - s = UncompressionDictReader::Create(this, ro, prefetch_buffer, use_cache, - prefetch_all, pin_all, lookup_context, - &uncompression_dict_reader); + s = UncompressionDictReader::Create( + this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned, + pin_unpartitioned, lookup_context, &uncompression_dict_reader); if (!s.ok()) { return s; } @@ -1121,7 +1084,7 @@ Status BlockBasedTable::ReadMetaIndexBlock( nullptr /* filter_policy */); if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.logger, "Encountered error while reading data from properties" " block %s", s.ToString().c_str()); @@ -1141,22 +1104,36 @@ Status BlockBasedTable::GetDataBlockFromCache( Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context) const { + const bool wait, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0; assert(block); assert(block->IsEmpty()); + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW; Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; + Statistics* statistics = rep_->ioptions.statistics.get(); + bool using_zstd = rep_->blocks_definitely_zstd_compressed; + const FilterPolicy* filter_policy = rep_->filter_policy; + Cache::CreateCallback create_cb = GetCreateCallback( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); // Lookup uncompressed cache first if (block_cache != nullptr) { - auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, - block_type, get_context); + auto cache_handle = GetEntryFromCache( + block_cache, block_cache_key, block_type, wait, get_context, + BlocklikeTraits::GetCacheItemHelper(block_type), create_cb, + priority); if (cache_handle != nullptr) { block->SetCachedValue( reinterpret_cast(block_cache->Value(cache_handle)), @@ -1173,10 +1150,13 @@ Status BlockBasedTable::GetDataBlockFromCache( } assert(!compressed_block_cache_key.empty()); - block_cache_compressed_handle = - block_cache_compressed->Lookup(compressed_block_cache_key); - - Statistics* statistics = rep_->ioptions.statistics; + BlockContents contents; + Cache::CreateCallback create_cb_special = GetCreateCallback( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); + block_cache_compressed_handle = block_cache_compressed->Lookup( + compressed_block_cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + create_cb_special, priority, true); // if we found in the compressed cache, then uncompress and insert into // uncompressed cache @@ -1193,7 +1173,6 @@ Status BlockBasedTable::GetDataBlockFromCache( assert(compression_type != kNoCompression); // Retrieve the uncompressed contents into a new buffer - BlockContents contents; UncompressionContext context(compression_type); UncompressionInfo info(context, uncompression_dict, compression_type); s = UncompressBlockContents( @@ -1201,7 +1180,8 @@ Status BlockBasedTable::GetDataBlockFromCache( &contents, rep_->table_options.format_version, rep_->ioptions, GetMemoryAllocator(rep_->table_options)); - // Insert uncompressed block into block cache + // Insert uncompressed block into block cache, the priority is based on the + // data block type. 
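// GetDataBlockFromCache() above now chooses the cache priority for both the
// lookup and the later insert from the block type: filter, index and
// compression-dictionary blocks go in at high priority when
// cache_index_and_filter_blocks_with_high_priority is set, everything else at
// low priority. The same decision as a standalone helper over local enums
// (ToyBlockType/ToyCachePriority are stand-ins for the real types).
enum class ToyBlockType { kData, kFilter, kIndex, kCompressionDictionary };
enum class ToyCachePriority { kLow, kHigh };

ToyCachePriority PriorityFor(ToyBlockType type,
                             bool metadata_blocks_get_high_priority) {
  const bool is_metadata = type == ToyBlockType::kFilter ||
                           type == ToyBlockType::kIndex ||
                           type == ToyBlockType::kCompressionDictionary;
  return (metadata_blocks_get_high_priority && is_metadata)
             ? ToyCachePriority::kHigh
             : ToyCachePriority::kLow;
}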
if (s.ok()) { std::unique_ptr block_holder( BlocklikeTraits::Create( @@ -1213,15 +1193,17 @@ Status BlockBasedTable::GetDataBlockFromCache( read_options.fill_cache) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; - s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle); + s = block_cache->Insert( + block_cache_key, block_holder.get(), + BlocklikeTraits::GetCacheItemHelper(block_type), charge, + &cache_handle, priority); if (s.ok()) { assert(cache_handle != nullptr); block->SetCachedValue(block_holder.release(), block_cache, cache_handle); UpdateCacheInsertionMetrics(block_type, get_context, charge, - s.IsOkOverwritten()); + s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1244,7 +1226,7 @@ Status BlockBasedTable::PutDataBlockToCache( const UncompressionDict& uncompression_dict, MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const { - const ImmutableCFOptions& ioptions = rep_->ioptions; + const ImmutableOptions& ioptions = rep_->ioptions; const uint32_t format_version = rep_->table_options.format_version; const size_t read_amp_bytes_per_bit = block_type == BlockType::kData @@ -1261,7 +1243,7 @@ Status BlockBasedTable::PutDataBlockToCache( assert(cached_block->IsEmpty()); Status s; - Statistics* statistics = ioptions.statistics; + Statistics* statistics = ioptions.stats; std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { @@ -1303,8 +1285,8 @@ Status BlockBasedTable::PutDataBlockToCache( new BlockContents(std::move(*raw_block_contents)); s = block_cache_compressed->Insert( compressed_block_cache_key, block_cont_for_comp_cache, - block_cont_for_comp_cache->ApproximateMemoryUsage(), - &DeleteCachedEntry); + BlocklikeTraits::GetCacheItemHelper(block_type), + block_cont_for_comp_cache->ApproximateMemoryUsage()); if (s.ok()) { // Avoid the following code to delete this cached block. 
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); @@ -1318,16 +1300,17 @@ Status BlockBasedTable::PutDataBlockToCache( if (block_cache != nullptr && block_holder->own_bytes()) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; - s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle, - priority); + s = block_cache->Insert( + block_cache_key, block_holder.get(), + BlocklikeTraits::GetCacheItemHelper(block_type), charge, + &cache_handle, priority); if (s.ok()) { assert(cache_handle != nullptr); cached_block->SetCachedValue(block_holder.release(), block_cache, cache_handle); UpdateCacheInsertionMetrics(block_type, get_context, charge, - s.IsOkOverwritten()); + s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1392,8 +1375,7 @@ DataBlockIter* BlockBasedTable::InitBlockIterator( DataBlockIter* input_iter, bool block_contents_pinned) { return block->NewDataIterator(rep->internal_comparator.user_comparator(), rep->get_global_seqno(block_type), input_iter, - rep->ioptions.statistics, - block_contents_pinned); + rep->ioptions.stats, block_contents_pinned); } template <> @@ -1402,7 +1384,7 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator( IndexBlockIter* input_iter, bool block_contents_pinned) { return block->NewIndexIterator( rep->internal_comparator.user_comparator(), - rep->get_global_seqno(block_type), input_iter, rep->ioptions.statistics, + rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats, /* total_order_seek */ true, rep->index_has_first_key, rep->index_key_includes_seq, rep->index_value_is_full, block_contents_pinned); @@ -1417,9 +1399,9 @@ template Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - BlockContents* contents) const { + const bool wait, CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, BlockContents* contents) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); @@ -1451,18 +1433,28 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (!contents) { s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, ro, block_entry, uncompression_dict, block_type, - get_context); - if (block_entry->GetValue()) { + wait, get_context); + // Value could still be null at this point, so check the cache handle + // and update the read pattern for prefetching + if (block_entry->GetValue() || block_entry->GetCacheHandle()) { // TODO(haoyu): Differentiate cache hit on uncompressed block cache and // compressed block cache. is_cache_hit = true; + if (prefetch_buffer) { + // Update the block details so that PrefetchBuffer can use the read + // pattern to determine if reads are sequential or not for + // prefetching. It should also take in account blocks read from cache. + prefetch_buffer->UpdateReadPattern(handle.offset(), + block_size(handle)); + } } } // Can't find the block from the cache. If I/O is allowed, read from the // file. 
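// The cache-hit path above now calls UpdateReadPattern() so that blocks served
// from the block cache still count toward the prefetcher's view of the access
// pattern. A toy tracker with the same shape, assuming the simplest possible
// sequential test (the next access starts where the previous one ended); the
// real FilePrefetchBuffer heuristics are not part of these hunks.
#include <cstddef>
#include <cstdint>

class ToyReadPatternTracker {
 public:
  // Record an access, whether it was served from cache or from the file.
  void UpdateReadPattern(uint64_t offset, size_t len) {
    sequential_ = have_prev_ && offset == prev_end_;
    prev_end_ = offset + len;
    have_prev_ = true;
  }
  // A prefetcher would only keep ramping up readahead while this stays true.
  bool LooksSequential() const { return sequential_; }

 private:
  bool have_prev_ = false;
  bool sequential_ = false;
  uint64_t prev_end_ = 0;
};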
- if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { - Statistics* statistics = rep_->ioptions.statistics; + if (block_entry->GetValue() == nullptr && + block_entry->GetCacheHandle() == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep_->ioptions.stats; const bool maybe_compressed = block_type != BlockType::kFilter && block_type != BlockType::kCompressionDictionary && @@ -1471,7 +1463,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( CompressionType raw_block_comp_type; BlockContents raw_block_contents; if (!contents) { - StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep_->ioptions.clock, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &raw_block_contents, rep_->ioptions, do_uncompress, @@ -1560,7 +1552,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( // Avoid making copy of block_key and cf_name when constructing the access // record. BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", trace_block_type, /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -1605,7 +1597,7 @@ void BlockBasedTable::RetrieveMultipleBlocks( char* scratch, const UncompressionDict& uncompression_dict) const { RandomAccessFileReader* file = rep_->file.get(); const Footer& footer = rep_->footer; - const ImmutableCFOptions& ioptions = rep_->ioptions; + const ImmutableOptions& ioptions = rep_->ioptions; size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); @@ -1624,7 +1616,8 @@ void BlockBasedTable::RetrieveMultipleBlocks( RetrieveBlock(nullptr, options, handle, uncompression_dict, &(*results)[idx_in_batch], BlockType::kData, mget_iter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true); } return; } @@ -1703,7 +1696,7 @@ void BlockBasedTable::RetrieveMultipleBlocks( AlignedBuf direct_io_buf; { IOOptions opts; - IOStatus s = PrepareIOFromReadOptions(options, file->env(), opts); + IOStatus s = file->PrepareIOOptions(options, opts); if (s.IsTimedOut()) { for (FSReadRequest& req : read_reqs) { req.status = s; @@ -1731,6 +1724,9 @@ void BlockBasedTable::RetrieveMultipleBlocks( size_t& req_idx = req_idx_for_block[valid_batch_idx]; size_t& req_offset = req_offset_for_block[valid_batch_idx]; valid_batch_idx++; + if (mget_iter->get_context) { + ++(mget_iter->get_context->get_context_stats_.num_data_read); + } FSReadRequest& req = read_reqs[req_idx]; Status s = req.status; if (s.ok()) { @@ -1784,13 +1780,21 @@ void BlockBasedTable::RetrieveMultipleBlocks( if (s.ok()) { // When the blocks share the same underlying buffer (scratch or direct io - // buffer), if the block is compressed, the shared buffer will be - // uncompressed into heap during uncompressing; otherwise, we need to - // manually copy the block into heap before inserting the block to block - // cache. + // buffer), we may need to manually copy the block into heap if the raw + // block has to be inserted into a cache. That falls into th following + // cases - + // 1. Raw block is not compressed, it needs to be inserted into the + // uncompressed block cache if there is one + // 2. 
If the raw block is compressed, it needs to be inserted into the + // compressed block cache if there is one + // + // In all other cases, the raw block is either uncompressed into a heap + // buffer or there is no cache at all. CompressionType compression_type = raw_block_contents.get_compression_type(); - if (use_shared_buffer && compression_type == kNoCompression) { + if (use_shared_buffer && (compression_type == kNoCompression || + (compression_type != kNoCompression && + rep_->table_options.block_cache_compressed))) { Slice raw = Slice(req.result.data() + req_offset, block_size(handle)); raw_block_contents = BlockContents( CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw), @@ -1810,8 +1814,8 @@ void BlockBasedTable::RetrieveMultipleBlocks( // necessary. Since we're passing the raw block contents, it will // avoid looking up the block cache s = MaybeReadBlockAndLoadToCache( - nullptr, options, handle, uncompression_dict, block_entry, - BlockType::kData, mget_iter->get_context, + nullptr, options, handle, uncompression_dict, /*wait=*/true, + block_entry, BlockType::kData, mget_iter->get_context, &lookup_data_block_context, &raw_block_contents); // block_entry value could be null if no block cache is present, i.e @@ -1845,7 +1849,7 @@ void BlockBasedTable::RetrieveMultipleBlocks( } if (s.ok()) { (*results)[idx_in_batch].SetOwnedValue(new Block( - std::move(contents), read_amp_bytes_per_bit, ioptions.statistics)); + std::move(contents), read_amp_bytes_per_bit, ioptions.stats)); } } (*statuses)[idx_in_batch] = s; @@ -1858,22 +1862,23 @@ Status BlockBasedTable::RetrieveBlock( const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const { + bool for_compaction, bool use_cache, bool wait_for_cache) const { assert(block_entry); assert(block_entry->IsEmpty()); Status s; if (use_cache) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, - uncompression_dict, block_entry, - block_type, get_context, lookup_context, - /*contents=*/nullptr); + s = MaybeReadBlockAndLoadToCache( + prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache, + block_entry, block_type, get_context, lookup_context, + /*contents=*/nullptr); if (!s.ok()) { return s; } - if (block_entry->GetValue() != nullptr) { + if (block_entry->GetValue() != nullptr || + block_entry->GetCacheHandle() != nullptr) { assert(s.ok()); return s; } @@ -1894,7 +1899,7 @@ Status BlockBasedTable::RetrieveBlock( std::unique_ptr block; { - StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, + StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, @@ -1941,28 +1946,28 @@ template Status BlockBasedTable::RetrieveBlock( const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, 
bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, @@ -1990,6 +1995,7 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( rep->index_value_is_full); } // Create an empty iterator + // TODO(ajkr): this is not the right way to handle an unpinned partition. return new IndexBlockIter(); } @@ -2026,8 +2032,10 @@ bool BlockBasedTable::PrefixMayMatch( } else { prefix_extractor = rep_->table_prefix_extractor.get(); } - auto user_key = ExtractUserKey(internal_key); - if (!prefix_extractor->InDomain(user_key)) { + auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + auto user_key_without_ts = + ExtractUserKeyAndStripTimestamp(internal_key, ts_sz); + if (!prefix_extractor->InDomain(user_key_without_ts)) { return true; } @@ -2042,15 +2050,16 @@ bool BlockBasedTable::PrefixMayMatch( if (!filter->IsBlockBased()) { const Slice* const const_ikey_ptr = &internal_key; may_match = filter->RangeMayExist( - read_options.iterate_upper_bound, user_key, prefix_extractor, - rep_->internal_comparator.user_comparator(), const_ikey_ptr, - &filter_checked, need_upper_bound_check, no_io, lookup_context); + read_options.iterate_upper_bound, user_key_without_ts, + prefix_extractor, rep_->internal_comparator.user_comparator(), + const_ikey_ptr, &filter_checked, need_upper_bound_check, no_io, + lookup_context); } else { // if prefix_extractor changed for block based filter, skip filter if (need_upper_bound_check) { return true; } - auto prefix = prefix_extractor->Transform(user_key); + auto prefix = prefix_extractor->Transform(user_key_without_ts); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -2104,7 +2113,7 @@ bool BlockBasedTable::PrefixMayMatch( } if (filter_checked) { - Statistics* statistics = rep_->ioptions.statistics; + Statistics* statistics = rep_->ioptions.stats; RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); @@ -2170,25 +2179,24 @@ bool BlockBasedTable::FullFilterKeyMayMatch( Slice user_key = ExtractUserKey(internal_key); const Slice* const const_ikey_ptr = &internal_key; bool may_match = true; + size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); if (rep_->whole_key_filtering) { - size_t ts_sz = - rep_->internal_comparator.user_comparator()->timestamp_size(); - Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); may_match = 
filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, no_io, const_ikey_ptr, get_context, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0 && - prefix_extractor->InDomain(user_key) && - !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, no_io, - const_ikey_ptr, get_context, - lookup_context)) { + prefix_extractor->InDomain(user_key_without_ts) && + !filter->PrefixMayMatch( + prefix_extractor->Transform(user_key_without_ts), + prefix_extractor, kNotValid, no_io, const_ikey_ptr, + get_context, lookup_context)) { may_match = false; } if (may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); } return may_match; @@ -2209,14 +2217,13 @@ void BlockBasedTable::FullFilterKeysMayMatch( lookup_context); uint64_t after_keys = range->KeysLeft(); if (after_keys) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE, - after_keys); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys, rep_->level); } uint64_t filtered_keys = before_keys - after_keys; if (filtered_keys) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL, filtered_keys); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys, rep_->level); } @@ -2225,12 +2232,11 @@ void BlockBasedTable::FullFilterKeysMayMatch( prefix_extractor->Name()) == 0) { filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, lookup_context); - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_PREFIX_CHECKED, - before_keys); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys); uint64_t after_keys = range->KeysLeft(); uint64_t filtered_keys = before_keys - after_keys; if (filtered_keys) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_PREFIX_USEFUL, + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL, filtered_keys); } } @@ -2266,7 +2272,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, get_context, &lookup_context); TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch"); if (!may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); } else { IndexBlockIter iiter_on_stack; @@ -2303,15 +2309,16 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // Not found // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; } if (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { // The requested key falls between highest key in previous block and // lowest key in current block. 
break; @@ -2354,8 +2361,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // Call the *saver function on each entry/block until it returns false for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; - if (ParseInternalKey(biter.key(), &parsed_key) != Status::OK()) { - s = Status::Corruption(Slice()); + Status pik_status = ParseInternalKey( + biter.key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; } if (!get_context->SaveValue( @@ -2382,7 +2391,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, referenced_key = key; } BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -2408,7 +2417,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } } if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } @@ -2475,6 +2484,8 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); + std::vector cache_handles; + bool wait_for_cache_results = false; CachableEntry uncompression_dict; Status uncompression_dict_status; @@ -2496,8 +2507,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, if (!iiter->Valid() || (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0)) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { // The requested key falls between highest key in previous block and // lowest key in current block. if (!iiter->status().IsNotFound()) { @@ -2546,20 +2558,61 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, Status s = RetrieveBlock( nullptr, ro, handle, dict, &(results.back()), BlockType::kData, miter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ false); if (s.IsIncomplete()) { s = Status::OK(); } if (s.ok() && !results.back().IsEmpty()) { - // Found it in the cache. Add NULL handle to indicate there is - // nothing to read from disk - block_handles.emplace_back(BlockHandle::NullBlockHandle()); + // Since we have a valid handle, check the value. If its nullptr, + // it means the cache is waiting for the final result and we're + // supposed to call WaitAll() to wait for the result. + if (results.back().GetValue() != nullptr) { + // Found it in the cache. Add NULL handle to indicate there is + // nothing to read from disk. 
+ if (results.back().GetCacheHandle()) { + results.back().UpdateCachedValue(); + } + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + // We have to wait for the cache lookup to finish in the + // background, and then we may have to read the block from disk + // anyway + assert(results.back().GetCacheHandle()); + wait_for_cache_results = true; + block_handles.emplace_back(handle); + cache_handles.emplace_back(results.back().GetCacheHandle()); + } } else { block_handles.emplace_back(handle); total_len += block_size(handle); } } + if (wait_for_cache_results) { + Cache* block_cache = rep_->table_options.block_cache.get(); + block_cache->WaitAll(cache_handles); + for (size_t i = 0; i < block_handles.size(); ++i) { + // If this block was a success or failure or not needed because + // the corresponding key is in the same block as a prior key, skip + if (block_handles[i] == BlockHandle::NullBlockHandle() || + results[i].IsEmpty()) { + continue; + } + results[i].UpdateCachedValue(); + void* val = results[i].GetValue(); + if (!val) { + // The async cache lookup failed - could be due to an error + // or a false positive. We need to read the data block from + // the SST file + results[i].Reset(); + total_len += block_size(block_handles[i]); + } else { + block_handles[i] = BlockHandle::NullBlockHandle(); + } + } + } + if (total_len) { char* scratch = nullptr; const UncompressionDict& dict = uncompression_dict.GetValue() @@ -2635,8 +2688,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, IndexValue v = iiter->value(); if (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { // The requested key falls between highest key in previous block and // lowest key in current block. break; @@ -2678,8 +2732,10 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, ParsedInternalKey parsed_key; Cleanable dummy; Cleanable* value_pinner = nullptr; - if (ParseInternalKey(biter->key(), &parsed_key) != Status::OK()) { - s = Status::Corruption(Slice()); + Status pik_status = ParseInternalKey( + biter->key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; } if (biter->IsValuePinned()) { if (reusing_block) { @@ -2716,7 +2772,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, referenced_key = key; } BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -2748,7 +2804,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, } while (iiter->Valid()); if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } @@ -2869,11 +2925,11 @@ Status BlockBasedTable::VerifyChecksumInBlocks( // increasing of the buffer size. size_t readahead_size = (read_options.readahead_size != 0) ? 
read_options.readahead_size - : kMaxAutoReadaheadSize; + : rep_->table_options.max_auto_readahead_size; // FilePrefetchBuffer doesn't work in mmap mode and readahead is not // needed there. FilePrefetchBuffer prefetch_buffer( - rep_->file.get(), readahead_size /* readadhead_size */, + rep_->file.get(), readahead_size /* readahead_size */, readahead_size /* max_readahead_size */, !rep_->ioptions.allow_mmap_reads /* enable */); @@ -3040,7 +3096,7 @@ Status BlockBasedTable::CreateIndexReader( auto meta_index_iter = preloaded_meta_index_iter; bool should_fallback = false; if (rep_->internal_prefix_transform.get() == nullptr) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "No prefix extractor passed in. Fall back to binary" " search index."); should_fallback = true; @@ -3050,7 +3106,7 @@ Status BlockBasedTable::CreateIndexReader( if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Unable to read the metaindex block." " Fall back to binary search index."); should_fallback = true; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 2923b482d34..43b56a68ca3 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -64,9 +64,6 @@ class BlockBasedTable : public TableReader { // All the below fields control iterator readahead static const size_t kInitAutoReadaheadSize = 8 * 1024; - // Found that 256 KB readahead size provides the best performance, based on - // experiments, for auto readahead. Experiment data is in PR #3282. - static const size_t kMaxAutoReadaheadSize; static const int kMinNumFileReadsToStartAutoReadahead = 2; // Attempt to open the table that is stored in bytes [0..file_size) @@ -87,7 +84,7 @@ class BlockBasedTable : public TableReader { // are set. // @param force_direct_prefetch if true, always prefetching to RocksDB // buffer, rather than calling RandomAccessFile::Prefetch(). - static Status Open(const ReadOptions& ro, const ImmutableCFOptions& ioptions, + static Status Open(const ReadOptions& ro, const ImmutableOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, @@ -102,7 +99,9 @@ class BlockBasedTable : public TableReader { bool force_direct_prefetch = false, TailPrefetchStats* tail_prefetch_stats = nullptr, BlockCacheTracer* const block_cache_tracer = nullptr, - size_t max_file_size_for_l0_meta_pin = 0); + size_t max_file_size_for_l0_meta_pin = 0, + const std::string& db_session_id = "", + uint64_t cur_file_num = 0); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, @@ -218,6 +217,11 @@ class BlockBasedTable : public TableReader { size_t cache_key_prefix_size, const BlockHandle& handle, char* cache_key); + static void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage, + bool redundant, + Statistics* const statistics); + // Retrieve all key value pairs from data blocks in the table. // The key retrieved are internal keys. 
Status GetKVPairsFromDataBlocks(std::vector* kv_pair_blocks); @@ -268,12 +272,13 @@ class BlockBasedTable : public TableReader { size_t usage) const; void UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const; - void UpdateCacheInsertionMetrics(BlockType block_type, - GetContext* get_context, size_t usage, - bool redundant) const; + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - BlockType block_type, - GetContext* get_context) const; + BlockType block_type, const bool wait, + GetContext* get_context, + const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority) const; // Either Block::NewDataIterator() or Block::NewIndexIterator(). template @@ -295,9 +300,9 @@ class BlockBasedTable : public TableReader { Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - BlockContents* contents) const; + const bool wait, CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, BlockContents* contents) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the @@ -309,7 +314,8 @@ class BlockBasedTable : public TableReader { CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, + bool wait_for_cache) const; void RetrieveMultipleBlocks( const ReadOptions& options, const MultiGetRange* batch, @@ -349,7 +355,7 @@ class BlockBasedTable : public TableReader { Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context) const; + const bool wait, GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -446,20 +452,37 @@ class BlockBasedTable : public TableReader { bool use_cache, bool prefetch, bool pin, BlockCacheLookupContext* lookup_context); - static void SetupCacheKeyPrefix(Rep* rep); + static void SetupCacheKeyPrefix(Rep* rep, const std::string& db_session_id, + uint64_t cur_file_num); // Generate a cache key prefix from the file template static void GenerateCachePrefix(TCache* cc, TFile* file, char* buffer, - size_t* size) { + size_t* size, + const std::string& db_session_id, + uint64_t cur_file_num) { // generate an id from the file *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); // If the prefix wasn't generated or was too long, - // create one from the cache. + // create one based on the DbSessionId and curent file number if they + // are set. Otherwise, created from NewId() if (cc != nullptr && *size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); + if (db_session_id.size() == 20) { + // db_session_id is 20 bytes as defined. 
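+        // Resulting prefix layout: bytes [0, 20) hold the raw db_session_id
+        // and the bytes from offset 20 hold a varint64 of cur_file_num (or
+        // of cc->NewId() when cur_file_num is 0).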
+ memcpy(buffer, db_session_id.c_str(), 20); + char* end; + if (cur_file_num != 0) { + end = EncodeVarint64(buffer + 20, cur_file_num); + } else { + end = EncodeVarint64(buffer + 20, cc->NewId()); + } + // kMaxVarint64Length is 10 therefore, the prefix is at most 30 bytes. + *size = static_cast(end - buffer); + } else { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } } } @@ -505,7 +528,7 @@ class BlockBasedTable::PartitionedIndexIteratorState // Stores all the properties associated with a BlockBasedTable. // These are immutable. struct BlockBasedTable::Rep { - Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, + Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, const InternalKeyComparator& _internal_comparator, bool skip_filters, uint64_t _file_size, int _level, const bool _immortal_table) @@ -524,7 +547,7 @@ struct BlockBasedTable::Rep { level(_level), immortal_table(_immortal_table) {} ~Rep() { status.PermitUncheckedError(); } - const ImmutableCFOptions& ioptions; + const ImmutableOptions& ioptions; const EnvOptions& env_options; const BlockBasedTableOptions table_options; const FilterPolicy* const filter_policy; @@ -626,19 +649,23 @@ struct BlockBasedTable::Rep { uint64_t sst_number_for_tracing() const { return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; } - void CreateFilePrefetchBuffer( - size_t readahead_size, size_t max_readahead_size, - std::unique_ptr* fpb) const { - fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size, - max_readahead_size, - !ioptions.allow_mmap_reads /* enable */)); + void CreateFilePrefetchBuffer(size_t readahead_size, + size_t max_readahead_size, + std::unique_ptr* fpb, + bool implicit_auto_readahead) const { + fpb->reset(new FilePrefetchBuffer( + file.get(), readahead_size, max_readahead_size, + !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset*/, + implicit_auto_readahead)); } void CreateFilePrefetchBufferIfNotExists( size_t readahead_size, size_t max_readahead_size, - std::unique_ptr* fpb) const { + std::unique_ptr* fpb, + bool implicit_auto_readahead) const { if (!(*fpb)) { - CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb); + CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb, + implicit_auto_readahead); } } }; @@ -655,13 +682,21 @@ class WritableFileStringStreamAdapter : public std::stringbuf { explicit WritableFileStringStreamAdapter(WritableFile* writable_file) : file_(writable_file) {} - // This is to handle `std::endl`, `endl` is written by `os.put()` directly - // without going through `xsputn()`. As we explicitly disabled buffering, - // every write, not captured by xsputn, is an overflow. + // Override overflow() to handle `sputc()`. There are cases that will not go + // through `xsputn()` e.g. `std::endl` or an unsigned long long is written by + // `os.put()` directly and will call `sputc()` By internal implementation: + // int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) { // put a character + // return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) : + // overflow(_Traits::to_int_type(_Ch)); + // } + // As we explicitly disabled buffering (_Pnavail() is always 0), every write, + // not captured by xsputn(), becomes an overflow here. 
int overflow(int ch = EOF) override { - if (ch == '\n') { - file_->Append("\n"); - return ch; + if (ch != EOF) { + Status s = file_->Append(Slice((char*)&ch, 1)); + if (s.ok()) { + return ch; + } } return EOF; } diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h index d9cfaa92ca5..603c6243153 100644 --- a/table/block_based/block_based_table_reader_impl.h +++ b/table/block_based/block_based_table_reader_impl.h @@ -54,7 +54,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, get_context, lookup_context, for_compaction, - /* use_cache */ true); + /* use_cache */ true, /* wait_for_cache */ true); if (!s.ok()) { assert(block.IsEmpty()); diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index be2af3195d9..07136dbf809 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -59,17 +59,17 @@ class BlockBasedTableReaderTest // Create table builder. Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator comparator(options.comparator); ColumnFamilyOptions cf_options; MutableCFOptions moptions(cf_options); - std::vector> factories; + IntTblPropCollectorFactories factories; std::unique_ptr table_builder(table_factory_->NewTableBuilder( TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, 0 /* sample_for_compression */, - CompressionOptions(), false /* skip_filters */, - kDefaultColumnFamilyName, -1 /* level */), - 0 /* column_family_id */, writer.get())); + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); // Build table. 
for (auto it = kv.begin(); it != kv.end(); it++) { @@ -81,7 +81,7 @@ class BlockBasedTableReaderTest } void NewBlockBasedTableReader(const FileOptions& foptions, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const InternalKeyComparator& comparator, const std::string& table_name, std::unique_ptr* table) { @@ -135,7 +135,8 @@ class BlockBasedTableReaderTest std::string path = Path(filename); std::unique_ptr f; ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); - reader->reset(new RandomAccessFileReader(std::move(f), path, env_)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); } std::string ToInternalKey(const std::string& key) { @@ -196,7 +197,7 @@ TEST_P(BlockBasedTableReaderTest, MultiGet) { std::unique_ptr table; Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); FileOptions foptions; foptions.use_direct_reads = use_direct_reads_; InternalKeyComparator comparator(options.comparator); @@ -274,7 +275,7 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { std::unique_ptr table; Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); FileOptions foptions; foptions.use_direct_reads = use_direct_reads_; InternalKeyComparator comparator(options.comparator); @@ -299,7 +300,8 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { table.reset(); // Corrupt the block pointed to by handle - test::CorruptFile(Path(table_name), static_cast(handle.offset()), 128); + ASSERT_OK(test::CorruptFile(options.env, Path(table_name), + static_cast(handle.offset()), 128)); NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); Status s = table->VerifyChecksum(ReadOptions(), diff --git a/table/block_based/block_like_traits.h b/table/block_based/block_like_traits.h new file mode 100644 index 00000000000..ccfa8bc56b3 --- /dev/null +++ b/table/block_based/block_like_traits.h @@ -0,0 +1,225 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
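Editorial aside (not part of the patch): the new block_like_traits.h that begins here gives each cached block type the three callbacks a serialization-aware block cache needs: report the object's byte size, copy a byte range of its serialized form, and rebuild the object from raw bytes. The toy type and function names below are hypothetical stand-ins that only illustrate those roles.

#include <cstddef>
#include <cstring>
#include <string>

struct ToyBlock {
  std::string data;
};

// Size of the serialized form (here just the payload length).
inline size_t ToySizeCallback(void* obj) {
  return static_cast<ToyBlock*>(obj)->data.size();
}

// Copy `length` serialized bytes starting at `from_offset` into `out`.
inline void ToySaveToCallback(void* from_obj, size_t from_offset,
                              size_t length, void* out) {
  ToyBlock* ptr = static_cast<ToyBlock*>(from_obj);
  std::memcpy(out, ptr->data.data() + from_offset, length);
}

// Rebuild an in-memory object from raw bytes handed back by the cache.
inline ToyBlock* ToyCreateCallback(const void* buf, size_t size) {
  ToyBlock* block = new ToyBlock;
  block->data.assign(static_cast<const char*>(buf), size);
  return block;
}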
+ +#pragma once + +#include "cache/cache_entry_roles.h" +#include "port/lang.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +template +class BlocklikeTraits; + +template +Cache::CacheItemHelper* GetCacheItemHelperForRole(); + +template +Cache::CreateCallback GetCreateCallback(size_t read_amp_bytes_per_bit, + Statistics* statistics, bool using_zstd, + const FilterPolicy* filter_policy) { + return [read_amp_bytes_per_bit, statistics, using_zstd, filter_policy]( + void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + assert(buf != nullptr); + std::unique_ptr buf_data(new char[size]()); + memcpy(buf_data.get(), buf, size); + BlockContents bc = BlockContents(std::move(buf_data), size); + TBlocklike* ucd_ptr = BlocklikeTraits::Create( + std::move(bc), read_amp_bytes_per_bit, statistics, using_zstd, + filter_policy); + *out_obj = reinterpret_cast(ucd_ptr); + *charge = size; + return Status::OK(); + }; +} + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + BlockContents* ptr = static_cast(obj); + return ptr->data.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + BlockContents* ptr = static_cast(from_obj); + const char* buf = ptr->data.data(); + assert(length == ptr->data.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + if (block_type == BlockType::kFilter) { + return GetCacheItemHelperForRole< + BlockContents, CacheEntryRole::kDeprecatedFilterBlock>(); + } else { + // E.g. 
compressed cache + return GetCacheItemHelperForRole(); + } + } +}; + +template <> +class BlocklikeTraits { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast(obj); + return ptr->GetBlockContentsData().size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast(from_obj); + const char* buf = ptr->GetBlockContentsData().data(); + assert(length == ptr->GetBlockContentsData().size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == BlockType::kFilter); + return GetCacheItemHelperForRole(); + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics, bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), read_amp_bytes_per_bit, statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + Block* ptr = static_cast(obj); + return ptr->size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + Block* ptr = static_cast(from_obj); + const char* buf = ptr->data(); + assert(length == ptr->size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + switch (block_type) { + case BlockType::kData: + return GetCacheItemHelperForRole(); + case BlockType::kIndex: + return GetCacheItemHelperForRole(); + case BlockType::kFilter: + return GetCacheItemHelperForRole(); + default: + // Not a recognized combination + assert(false); + FALLTHROUGH_INTENDED; + case BlockType::kRangeDeletion: + return GetCacheItemHelperForRole(); + } + } +}; + +template <> +class BlocklikeTraits { + public: + static UncompressionDict* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + UncompressionDict* ptr = static_cast(obj); + return ptr->slice_.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + UncompressionDict* ptr = static_cast(from_obj); + const char* buf = ptr->slice_.data(); + assert(length == ptr->slice_.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == 
BlockType::kCompressionDictionary); + return GetCacheItemHelperForRole(); + } +}; + +// Get an CacheItemHelper pointer for value type T and role R. +template +Cache::CacheItemHelper* GetCacheItemHelperForRole() { + static Cache::CacheItemHelper cache_helper( + BlocklikeTraits::SizeCallback, BlocklikeTraits::SaveToCallback, + GetCacheEntryDeleterForRole()); + return &cache_helper; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc index aa3fc36108d..d9ef162c63f 100644 --- a/table/block_based/block_prefetcher.cc +++ b/table/block_based/block_prefetcher.cc @@ -16,51 +16,83 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep, if (is_for_compaction) { rep->CreateFilePrefetchBufferIfNotExists(compaction_readahead_size_, compaction_readahead_size_, - &prefetch_buffer_); + &prefetch_buffer_, false); return; } - // Explicit user requested readahead + // Explicit user requested readahead. if (readahead_size > 0) { rep->CreateFilePrefetchBufferIfNotExists(readahead_size, readahead_size, - &prefetch_buffer_); + &prefetch_buffer_, false); return; } + // Implicit readahead. + + // If max_auto_readahead_size is set to be 0 by user, no data will be + // prefetched. + size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size; + if (max_auto_readahead_size == 0) { + return; + } + + size_t len = static_cast(block_size(handle)); + size_t offset = handle.offset(); + + // If FS supports prefetching (readahead_limit_ will be non zero in that case) + // and current block exists in prefetch buffer then return. + if (offset + len <= readahead_limit_) { + UpdateReadPattern(offset, len); + return; + } + + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, len); + ResetValues(); + return; + } + UpdateReadPattern(offset, len); + // Implicit auto readahead, which will be enabled if the number of reads - // reached `kMinNumFileReadsToStartAutoReadahead` (default: 2). + // reached `kMinNumFileReadsToStartAutoReadahead` (default: 2) and scans are + // sequential. num_file_reads_++; if (num_file_reads_ <= BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { return; } + size_t initial_auto_readahead_size = BlockBasedTable::kInitAutoReadaheadSize; + if (initial_auto_readahead_size > max_auto_readahead_size) { + initial_auto_readahead_size = max_auto_readahead_size; + } + if (rep->file->use_direct_io()) { - rep->CreateFilePrefetchBufferIfNotExists( - BlockBasedTable::kInitAutoReadaheadSize, - BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); + rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size, + max_auto_readahead_size, + &prefetch_buffer_, true); return; } - if (handle.offset() + static_cast(block_size(handle)) <= - readahead_limit_) { - return; + if (readahead_size_ > max_auto_readahead_size) { + readahead_size_ = max_auto_readahead_size; } // If prefetch is not supported, fall back to use internal prefetch buffer. // Discarding other return status of Prefetch calls intentionally, as // we can fallback to reading from disk if Prefetch fails. 
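Editorial aside (not part of the patch): a compact illustration of the auto-readahead sizing that PrefetchIfNeeded() now implements. Implicit readahead stays off for the first kMinNumFileReadsToStartAutoReadahead (2) sequential reads, then starts at kInitAutoReadaheadSize (8 KB) and doubles on each subsequent read, clamped to BlockBasedTableOptions::max_auto_readahead_size; 256 KB is assumed here as the cap, matching the former kMaxAutoReadaheadSize.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t kInit = 8 * 1024;   // kInitAutoReadaheadSize
  const size_t kMax = 256 * 1024;  // assumed max_auto_readahead_size
  size_t readahead = kInit;
  for (int read = 1; read <= 8; ++read) {
    if (read <= 2) {
      // Below kMinNumFileReadsToStartAutoReadahead: no prefetching yet.
      std::printf("read %d: no readahead\n", read);
      continue;
    }
    std::printf("read %d: readahead %zu bytes\n", read, readahead);
    // Keep exponentially increasing the readahead size until the cap.
    readahead = std::min(kMax, readahead * 2);
  }
  return 0;
}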
- Status s = rep->file->Prefetch(handle.offset(), readahead_size_); + Status s = rep->file->Prefetch(handle.offset(), + block_size(handle) + readahead_size_); if (s.IsNotSupported()) { - rep->CreateFilePrefetchBufferIfNotExists( - BlockBasedTable::kInitAutoReadaheadSize, - BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); + rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size, + max_auto_readahead_size, + &prefetch_buffer_, true); return; } - readahead_limit_ = static_cast(handle.offset() + readahead_size_); + + readahead_limit_ = offset + len + readahead_size_; // Keep exponentially increasing readahead size until - // kMaxAutoReadaheadSize. - readahead_size_ = - std::min(BlockBasedTable::kMaxAutoReadaheadSize, readahead_size_ * 2); + // max_auto_readahead_size. + readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_prefetcher.h b/table/block_based/block_prefetcher.h index ee3b61f5c09..30b3d5eb2c1 100644 --- a/table/block_based/block_prefetcher.h +++ b/table/block_based/block_prefetcher.h @@ -19,6 +19,22 @@ class BlockPrefetcher { bool is_for_compaction); FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); } + void UpdateReadPattern(const size_t& offset, const size_t& len) { + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const size_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + void ResetValues() { + num_file_reads_ = 1; + readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; + readahead_limit_ = 0; + return; + } + private: // Readahead size used in compaction, its value is used only if // lookup_context_.caller = kCompaction. @@ -27,6 +43,8 @@ class BlockPrefetcher { size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; size_t readahead_limit_ = 0; int64_t num_file_reads_ = 0; + size_t prev_offset_ = 0; + size_t prev_len_ = 0; std::unique_ptr prefetch_buffer_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h index 598f1ef5771..155097c0536 100644 --- a/table/block_based/cachable_entry.h +++ b/table/block_based/cachable_entry.h @@ -162,7 +162,6 @@ class CachableEntry { } void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { - assert(value != nullptr); assert(cache != nullptr); assert(cache_handle != nullptr); @@ -179,6 +178,22 @@ class CachableEntry { assert(!own_value_); } + void UpdateCachedValue() { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + + value_ = static_cast(cache_->Value(cache_handle_)); + } + + bool IsReady() { + if (!own_value_) { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + return cache_->IsReady(cache_handle_); + } + return true; + } + private: void ReleaseResource() { if (LIKELY(cache_handle_ != nullptr)) { diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 94fa7e94f07..121f78cef4a 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -539,26 +539,27 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, int level_ = -1; std::vector keys; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); EnvOptions soptions; 
soptions.use_mmap_reads = ioptions.allow_mmap_reads; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr f(sink); file_writer.reset( - test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + new WritableFileWriter(std::move(f), "" /* don't care */, FileOptions())); std::unique_ptr builder; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - CompressionOptions(), false /* skip_filters */, - column_family_name, level_), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions( + ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, options.compression, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, level_), file_writer.get())); builder->Add(ik1.Encode().ToString(), v1); @@ -569,23 +570,20 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, file_writer->Flush(); EXPECT_TRUE(s.ok()) << s.ToString(); - EXPECT_EQ( - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), - builder->FileSize()); + EXPECT_EQ(sink->contents().size(), builder->FileSize()); // Open the table - file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents(), - 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + test::StringSource* source = new test::StringSource( + sink->contents(), 0 /*uniq_id*/, ioptions.allow_mmap_reads); + std::unique_ptr file(source); + file_reader.reset(new RandomAccessFileReader(std::move(file), "test")); const bool kSkipFilters = true; const bool kImmortal = true; - ioptions.table_factory->NewTableReader( + ASSERT_OK(ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, level_), - std::move(file_reader), - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), - &table_reader); + std::move(file_reader), sink->contents().size(), &table_reader)); // Search using Get() ReadOptions ro; diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index d94c7e606db..6f509c4f712 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -60,8 +60,11 @@ class FilterBlockBuilder { virtual bool IsBlockBased() = 0; // If is blockbased filter virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter - virtual void Add(const Slice& key) = 0; // Add a key to current filter - virtual size_t NumAdded() const = 0; // Number of keys added + virtual void Add( + const Slice& key_without_ts) = 0; // Add a key to current filter + virtual bool IsEmpty() const = 0; // Empty == none added + // For reporting stats on how many entries the builder considered unique + virtual size_t EstimateEntriesAdded() = 0; Slice Finish() { // Generate Filter const BlockHandle empty_handle; Status dont_care_status; @@ -108,11 +111,11 @@ class FilterBlockReader { uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { - const Slice ukey = iter->ukey; + const Slice ukey_without_ts = 
iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; - if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, - get_context, lookup_context)) { + if (!KeyMayMatch(ukey_without_ts, prefix_extractor, block_offset, no_io, + &ikey, get_context, lookup_context)) { range->SkipKey(iter); } } @@ -133,13 +136,13 @@ class FilterBlockReader { uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { - const Slice ukey = iter->ukey; + const Slice ukey_without_ts = iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; - if (prefix_extractor->InDomain(ukey) && - !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey, get_context, - lookup_context)) { + if (prefix_extractor->InDomain(ukey_without_ts) && + !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), + prefix_extractor, block_offset, no_io, &ikey, + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -153,10 +156,12 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) {} + virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) { + return Status::OK(); + } virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, - const Slice& user_key, + const Slice& user_key_without_ts, const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, @@ -167,7 +172,7 @@ class FilterBlockReader { return true; } *filter_checked = true; - Slice prefix = prefix_extractor->Transform(user_key); + Slice prefix = prefix_extractor->Transform(user_key_without_ts); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io, const_ikey_ptr, /* get_context */ nullptr, lookup_context); diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc index fa0802669c6..135fffdf2df 100644 --- a/table/block_based/filter_block_reader_common.cc +++ b/table/block_based/filter_block_reader_common.cc @@ -30,7 +30,8 @@ Status FilterBlockReaderCommon::ReadFilterBlock( table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, get_context, lookup_context, - /* for_compaction */ false, use_cache); + /* for_compaction */ false, use_cache, + /* wait_for_cache */ true); return s; } diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 31eb6b90df5..994ada1ba7e 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -7,119 +7,82 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
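Editorial aside (not part of the patch): the ukey_without_ts / key_without_ts plumbing in the filter interfaces above exists because, with user-defined timestamps enabled, a full user key is the key bytes followed by a fixed-size timestamp, while filters are built from the key bytes alone; every probe therefore strips the timestamp first. A minimal sketch of that slicing, with a hypothetical helper name:

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <string>

// Hypothetical helper: drop the trailing ts_sz timestamp bytes of a user key.
std::string StripTimestampForIllustration(const std::string& user_key,
                                          size_t ts_sz) {
  assert(user_key.size() >= ts_sz);
  return user_key.substr(0, user_key.size() - ts_sz);
}

int main() {
  const size_t kTsSize = 8;  // e.g. a 64-bit timestamp suffix
  std::string user_key = "foo" + std::string(kTsSize, 'x');
  // The filter was built from "foo", so the lookup must probe with "foo".
  std::printf("probe key: %s\n",
              StripTimestampForIllustration(user_key, kTsSize).c_str());
  return 0;
}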
+#include "rocksdb/filter_policy.h" + #include #include - -#include "rocksdb/filter_policy.h" +#include #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" -#include "table/block_based/full_filter_block.h" #include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" #include "third-party/folly/folly/ConstexprMath.h" #include "util/bloom_impl.h" #include "util/coding.h" #include "util/hash.h" +#include "util/ribbon_config.h" +#include "util/ribbon_impl.h" namespace ROCKSDB_NAMESPACE { namespace { -// See description in FastLocalBloomImpl -class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { +// Metadata trailer size for built-in filters. (This is separate from +// block-based table block trailer.) +// +// Originally this was 1 byte for num_probes and 4 bytes for number of +// cache lines in the Bloom filter, but now the first trailer byte is +// usually an implementation marker and remaining 4 bytes have various +// meanings. +static constexpr uint32_t kMetadataLen = 5; + +Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) { + // Missing metadata, treated as zero entries + return Slice(nullptr, 0); +} + +// Base class for filter builders using the XXH3 preview hash, +// also known as Hash64 or GetSliceHash64. +class XXH3pFilterBitsBuilder : public BuiltinFilterBitsBuilder { public: - // Non-null aggregate_rounding_balance implies optimize_filters_for_memory - explicit FastLocalBloomBitsBuilder( - const int millibits_per_key, + explicit XXH3pFilterBitsBuilder( std::atomic* aggregate_rounding_balance) - : millibits_per_key_(millibits_per_key), - aggregate_rounding_balance_(aggregate_rounding_balance) { - assert(millibits_per_key >= 1000); - } - - // No Copy allowed - FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; - void operator=(const FastLocalBloomBitsBuilder&) = delete; + : aggregate_rounding_balance_(aggregate_rounding_balance) {} - ~FastLocalBloomBitsBuilder() override {} + ~XXH3pFilterBitsBuilder() override {} virtual void AddKey(const Slice& key) override { uint64_t hash = GetSliceHash64(key); + // Especially with prefixes, it is common to have repetition, + // though only adjacent repetition, which we want to immediately + // recognize and collapse for estimating true filter space + // requirements. 
if (hash_entries_.empty() || hash != hash_entries_.back()) { hash_entries_.push_back(hash); } } - virtual Slice Finish(std::unique_ptr* buf) override { - size_t num_entry = hash_entries_.size(); - std::unique_ptr mutable_buf; - uint32_t len_with_metadata = - CalculateAndAllocate(num_entry, &mutable_buf, /*update_balance*/ true); - - assert(mutable_buf); - assert(len_with_metadata >= 5); - - // Compute num_probes after any rounding / adjustments - int num_probes = GetNumProbes(num_entry, len_with_metadata); - - uint32_t len = len_with_metadata - 5; - if (len > 0) { - AddAllEntries(mutable_buf.get(), len, num_probes); - } - - assert(hash_entries_.empty()); - - // See BloomFilterPolicy::GetBloomBitsReader re: metadata - // -1 = Marker for newer Bloom implementations - mutable_buf[len] = static_cast(-1); - // 0 = Marker for this sub-implementation - mutable_buf[len + 1] = static_cast(0); - // num_probes (and 0 in upper bits for 64-byte block size) - mutable_buf[len + 2] = static_cast(num_probes); - // rest of metadata stays zero - - Slice rv(mutable_buf.get(), len_with_metadata); - *buf = std::move(mutable_buf); - return rv; + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); } - int CalculateNumEntry(const uint32_t bytes) override { - uint32_t bytes_no_meta = bytes >= 5u ? bytes - 5u : 0; - return static_cast(uint64_t{8000} * bytes_no_meta / - millibits_per_key_); - } + protected: + static constexpr uint32_t kMetadataLen = 5; - uint32_t CalculateSpace(const int num_entry) override { - // NB: the BuiltinFilterBitsBuilder API presumes len fits in uint32_t. - return static_cast( - CalculateAndAllocate(static_cast(num_entry), - /* buf */ nullptr, - /*update_balance*/ false)); + // For delegating between XXH3pFilterBitsBuilders + void SwapEntriesWith(XXH3pFilterBitsBuilder* other) { + std::swap(hash_entries_, other->hash_entries_); } - // To choose size using malloc_usable_size, we have to actually allocate. - uint32_t CalculateAndAllocate(size_t num_entry, std::unique_ptr* buf, - bool update_balance) { - std::unique_ptr tmpbuf; - - // If not for cache line blocks in the filter, what would the target - // length in bytes be? - size_t raw_target_len = static_cast( - (uint64_t{num_entry} * millibits_per_key_ + 7999) / 8000); - - if (raw_target_len >= size_t{0xffffffc0}) { - // Max supported for this data structure implementation - raw_target_len = size_t{0xffffffc0}; - } - - // Round up to nearest multiple of 64 (block size). This adjustment is - // used for target FP rate only so that we don't receive complaints about - // lower FP rate vs. historic Bloom filter behavior. - uint32_t target_len = - static_cast(raw_target_len + 63) & ~uint32_t{63}; + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; + // To choose size using malloc_usable_size, we have to actually allocate. + size_t AllocateMaybeRounding(size_t target_len_with_metadata, + size_t num_entries, + std::unique_ptr* buf) { // Return value set to a default; overwritten in some cases - uint32_t rv = target_len + /* metadata */ 5; + size_t rv = target_len_with_metadata; #ifdef ROCKSDB_MALLOC_USABLE_SIZE if (aggregate_rounding_balance_ != nullptr) { // Do optimize_filters_for_memory, using malloc_usable_size. @@ -131,7 +94,7 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // allocation size. 
// Although it can be considered bad practice to use malloc_usable_size - // to access an object beyond its original size, this approach should + // to access an object beyond its original size, this approach should be // quite general: working for all allocators that properly support // malloc_usable_size. @@ -140,7 +103,8 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // and relative. int64_t balance = aggregate_rounding_balance_->load(); - double target_fp_rate = EstimatedFpRate(num_entry, target_len + 5); + double target_fp_rate = + EstimatedFpRate(num_entries, target_len_with_metadata); double rv_fp_rate = target_fp_rate; if (balance < 0) { @@ -151,14 +115,17 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // To simplify, we just try a few modified smaller sizes. This also // caps how much we vary filter size vs. target, to avoid outlier // behavior from excessive variance. - for (uint64_t maybe_len64 : + size_t target_len = target_len_with_metadata - kMetadataLen; + assert(target_len < target_len_with_metadata); // check underflow + for (uint64_t maybe_len_rough : {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16, uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) { - uint32_t maybe_len = - static_cast(maybe_len64) & ~uint32_t{63}; - double maybe_fp_rate = EstimatedFpRate(num_entry, maybe_len + 5); + size_t maybe_len_with_metadata = + RoundDownUsableSpace(maybe_len_rough + kMetadataLen); + double maybe_fp_rate = + EstimatedFpRate(num_entries, maybe_len_with_metadata); if (maybe_fp_rate <= for_balance_fp_rate) { - rv = maybe_len + /* metadata */ 5; + rv = maybe_len_with_metadata; rv_fp_rate = maybe_fp_rate; break; } @@ -168,12 +135,12 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // Filter blocks are loaded into block cache with their block trailer. // We need to make sure that's accounted for in choosing a // fragmentation-friendly size. - const uint32_t kExtraPadding = kBlockTrailerSize; + const size_t kExtraPadding = kBlockTrailerSize; size_t requested = rv + kExtraPadding; // Allocate and get usable size - tmpbuf.reset(new char[requested]); - size_t usable = malloc_usable_size(tmpbuf.get()); + buf->reset(new char[requested]); + size_t usable = malloc_usable_size(buf->get()); if (usable - usable / 4 > requested) { // Ratio greater than 4/3 is too much for utilizing, if it's @@ -183,53 +150,148 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // storage on disk. // Nothing to do, except assert that the result is accurate about // the usable size. (Assignment never used.) 
- assert((tmpbuf[usable - 1] = 'x')); + assert(((*buf)[usable - 1] = 'x')); } else if (usable > requested) { - // Adjust for reasonably larger usable size - size_t usable_len = (usable - kExtraPadding - /* metadata */ 5); - if (usable_len >= size_t{0xffffffc0}) { - // Max supported for this data structure implementation - usable_len = size_t{0xffffffc0}; - } - - rv = (static_cast(usable_len) & ~uint32_t{63}) + - /* metadata */ 5; - rv_fp_rate = EstimatedFpRate(num_entry, rv); + rv = RoundDownUsableSpace(usable - kExtraPadding); + assert(rv <= usable - kExtraPadding); + rv_fp_rate = EstimatedFpRate(num_entries, rv); } else { // Too small means bad malloc_usable_size assert(usable == requested); } - memset(tmpbuf.get(), 0, rv); + memset(buf->get(), 0, rv); - if (update_balance) { - int64_t diff = static_cast((rv_fp_rate - target_fp_rate) * - double{0x100000000}); - *aggregate_rounding_balance_ += diff; - } + // Update balance + int64_t diff = static_cast((rv_fp_rate - target_fp_rate) * + double{0x100000000}); + *aggregate_rounding_balance_ += diff; + } else { + buf->reset(new char[rv]()); } #else - (void)update_balance; + (void)num_entries; + buf->reset(new char[rv]()); #endif // ROCKSDB_MALLOC_USABLE_SIZE - if (buf) { - if (tmpbuf) { - *buf = std::move(tmpbuf); - } else { - buf->reset(new char[rv]()); - } + return rv; + } + + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque hash_entries_; + + // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, + // always "round up" like historic behavior. + std::atomic* aggregate_rounding_balance_; +}; + +// #################### FastLocalBloom implementation ################## // +// ############## also known as format_version=5 Bloom filter ########## // + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsBuilder : public XXH3pFilterBitsBuilder { + public: + // Non-null aggregate_rounding_balance implies optimize_filters_for_memory + explicit FastLocalBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance) + : XXH3pFilterBitsBuilder(aggregate_rounding_balance), + millibits_per_key_(millibits_per_key) { + assert(millibits_per_key >= 1000); + } + + // No Copy allowed + FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; + void operator=(const FastLocalBloomBitsBuilder&) = delete; + + ~FastLocalBloomBitsBuilder() override {} + + virtual Slice Finish(std::unique_ptr* buf) override { + size_t num_entries = hash_entries_.size(); + size_t len_with_metadata = CalculateSpace(num_entries); + + std::unique_ptr mutable_buf; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + + assert(mutable_buf); + assert(len_with_metadata >= kMetadataLen); + + // Max size supported by implementation + assert(len_with_metadata <= 0xffffffffU); + + // Compute num_probes after any rounding / adjustments + int num_probes = GetNumProbes(num_entries, len_with_metadata); + + uint32_t len = static_cast(len_with_metadata - kMetadataLen); + if (len > 0) { + AddAllEntries(mutable_buf.get(), len, num_probes); } + + assert(hash_entries_.empty()); + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + mutable_buf[len] = static_cast(-1); + // 0 = Marker for this sub-implementation + mutable_buf[len + 1] = static_cast(0); + // num_probes (and 0 in upper bits for 64-byte block size) + mutable_buf[len + 2] = static_cast(num_probes); + // rest of 
metadata stays zero + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); return rv; } + size_t ApproximateNumEntries(size_t bytes) override { + size_t bytes_no_meta = + bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0; + return static_cast(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + size_t CalculateSpace(size_t num_entries) override { + // If not for cache line blocks in the filter, what would the target + // length in bytes be? + size_t raw_target_len = static_cast( + (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000); + + if (raw_target_len >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + raw_target_len = size_t{0xffffffc0}; + } + + // Round up to nearest multiple of 64 (block size). This adjustment is + // used for target FP rate only so that we don't receive complaints about + // lower FP rate vs. historic Bloom filter behavior. + return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen; + } + double EstimatedFpRate(size_t keys, size_t len_with_metadata) override { int num_probes = GetNumProbes(keys, len_with_metadata); return FastLocalBloomImpl::EstimatedFpRate( - keys, len_with_metadata - /*metadata*/ 5, num_probes, /*hash bits*/ 64); + keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + if (rv >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + rv = size_t{0xffffffc0}; + } + + // round down to multiple of 64 (block size) + rv &= ~size_t{63}; + + return rv + kMetadataLen; } private: // Compute num_probes after any rounding / adjustments int GetNumProbes(size_t keys, size_t len_with_metadata) { - uint64_t millibits = uint64_t{len_with_metadata - 5} * 8000; + uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000; int actual_millibits_per_key = static_cast(millibits / std::max(keys, size_t{1})); // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to @@ -291,12 +353,6 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // Target allocation per added key, in thousandths of a bit. int millibits_per_key_; - // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, - // always "round up" like historic behavior. - std::atomic* aggregate_rounding_balance_; - // A deque avoids unnecessary copying of already-saved values - // and has near-minimal peak memory use. - std::deque hash_entries_; }; // See description in FastLocalBloomImpl @@ -341,6 +397,362 @@ class FastLocalBloomBitsReader : public FilterBitsReader { const uint32_t len_bytes_; }; +// ##################### Ribbon filter implementation ################### // + +// Implements concept RehasherTypesAndSettings in ribbon_impl.h +struct Standard128RibbonRehasherTypesAndSettings { + // These are schema-critical. Any change almost certainly changes + // underlying data. + static constexpr bool kIsFilter = true; + static constexpr bool kHomogeneous = false; + static constexpr bool kFirstCoeffAlwaysOne = true; + static constexpr bool kUseSmash = false; + using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128; + using Hash = uint64_t; + using Seed = uint32_t; + // Changing these doesn't necessarily change underlying data, + // but might affect supported scalability of those dimensions. 
+ using Index = uint32_t; + using ResultRow = uint32_t; + // Save a conditional in Ribbon queries + static constexpr bool kAllowZeroStarts = false; +}; + +using Standard128RibbonTypesAndSettings = + ribbon::StandardRehasherAdapter; + +class Standard128RibbonBitsBuilder : public XXH3pFilterBitsBuilder { + public: + explicit Standard128RibbonBitsBuilder( + double desired_one_in_fp_rate, int bloom_millibits_per_key, + std::atomic* aggregate_rounding_balance, Logger* info_log) + : XXH3pFilterBitsBuilder(aggregate_rounding_balance), + desired_one_in_fp_rate_(desired_one_in_fp_rate), + info_log_(info_log), + bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance) { + assert(desired_one_in_fp_rate >= 1.0); + } + + // No Copy allowed + Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete; + void operator=(const Standard128RibbonBitsBuilder&) = delete; + + ~Standard128RibbonBitsBuilder() override {} + + virtual Slice Finish(std::unique_ptr* buf) override { + if (hash_entries_.size() > kMaxRibbonEntries) { + ROCKS_LOG_WARN(info_log_, "Too many keys for Ribbon filter: %llu", + static_cast(hash_entries_.size())); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + if (hash_entries_.size() == 0) { + // Save a conditional in Ribbon queries by using alternate reader + // for zero entries added. + return FinishAlwaysFalse(buf); + } + uint32_t num_entries = static_cast(hash_entries_.size()); + uint32_t num_slots; + size_t len_with_metadata; + + CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots); + + // Bloom fall-back indicator + if (num_slots == 0) { + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + + uint32_t entropy = 0; + if (!hash_entries_.empty()) { + entropy = Lower32of64(hash_entries_.front()); + } + + BandingType banding; + bool success = banding.ResetAndFindSeedToSolve( + num_slots, hash_entries_.begin(), hash_entries_.end(), + /*starting seed*/ entropy & 255, /*seed mask*/ 255); + if (!success) { + ROCKS_LOG_WARN(info_log_, + "Too many re-seeds (256) for Ribbon filter, %llu / %llu", + static_cast(hash_entries_.size()), + static_cast(num_slots)); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + hash_entries_.clear(); + + uint32_t seed = banding.GetOrdinalSeed(); + assert(seed < 256); + + std::unique_ptr mutable_buf; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + + SolnType soln(mutable_buf.get(), len_with_metadata); + soln.BackSubstFrom(banding); + uint32_t num_blocks = soln.GetNumBlocks(); + // This should be guaranteed: + // num_entries < 2^30 + // => (overhead_factor < 2.0) + // num_entries * overhead_factor == num_slots < 2^31 + // => (num_blocks = num_slots / 128) + // num_blocks < 2^24 + assert(num_blocks < 0x1000000U); + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -2 = Marker for Standard128 Ribbon + mutable_buf[len_with_metadata - 5] = static_cast(-2); + // Hash seed + mutable_buf[len_with_metadata - 4] = static_cast(seed); + // Number of blocks, in 24 bits + // (Along with bytes, we can derive other settings) + mutable_buf[len_with_metadata - 3] = static_cast(num_blocks & 255); + mutable_buf[len_with_metadata - 2] = + static_cast((num_blocks >> 8) & 255); + mutable_buf[len_with_metadata - 1] = + static_cast((num_blocks >> 16) & 255); + + Slice rv(mutable_buf.get(), 
len_with_metadata); + *buf = std::move(mutable_buf); + return rv; + } + + // Setting num_slots to 0 means "fall back on Bloom filter." + // And note this implementation does not support num_entries or num_slots + // beyond uint32_t; see kMaxRibbonEntries. + void CalculateSpaceAndSlots(size_t num_entries, + size_t* target_len_with_metadata, + uint32_t* num_slots) { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries); + return; + } + uint32_t entropy = 0; + if (!hash_entries_.empty()) { + entropy = Upper32of64(hash_entries_.front()); + } + + *num_slots = NumEntriesToNumSlots(static_cast(num_entries)); + *target_len_with_metadata = + SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_, + /*rounding*/ entropy) + + kMetadataLen; + + // Consider possible Bloom fallback for small filters + if (*num_slots < 1024) { + size_t bloom = bloom_fallback_.CalculateSpace(num_entries); + if (bloom < *target_len_with_metadata) { + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom; + return; + } + } + } + + size_t CalculateSpace(size_t num_entries) override { + if (num_entries == 0) { + // See FinishAlwaysFalse + return 0; + } + size_t target_len_with_metadata; + uint32_t num_slots; + CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots); + (void)num_slots; + return target_len_with_metadata; + } + + // This is a somewhat ugly but reasonably fast and reasonably accurate + // reversal of CalculateSpace. + size_t ApproximateNumEntries(size_t bytes) override { + size_t len_no_metadata = + RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) - + kMetadataLen; + + if (!(desired_one_in_fp_rate_ > 1.0)) { + // Effectively asking for 100% FP rate, or NaN etc. + // Note that NaN is neither < 1.0 nor > 1.0 + return kMaxRibbonEntries; + } + + // Find a slight under-estimate for actual average bits per slot + double min_real_bits_per_slot; + if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits::max()) { + // Max of 32 solution columns (result bits) + min_real_bits_per_slot = 32.0; + } else { + // Account for mix of b and b+1 solution columns being slightly + // suboptimal vs. ideal log2(1/fp_rate) bits. + uint32_t rounded = static_cast(desired_one_in_fp_rate_); + int upper_bits_per_key = 1 + FloorLog2(rounded); + double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key); + double portion_lower = + (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) / + fp_rate_for_upper; + min_real_bits_per_slot = upper_bits_per_key - portion_lower; + assert(min_real_bits_per_slot > 0.0); + assert(min_real_bits_per_slot <= 32.0); + } + + // An overestimate, but this should only be O(1) slots away from truth. 
+ double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot; + + // Let's not bother accounting for overflow to Bloom filter + // (Includes NaN case) + if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) { + return kMaxRibbonEntries; + } + + // Set up for short iteration + uint32_t slots = static_cast(max_slots); + slots = SolnType::RoundUpNumSlots(slots); + + // Assert that we have a valid upper bound on slots + assert(SolnType::GetBytesForOneInFpRate( + SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_, + /*rounding*/ 0) > len_no_metadata); + + // Iterate up to a few times to rather precisely account for small effects + for (int i = 0; slots > 0; ++i) { + size_t reqd_bytes = + SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_, + /*rounding*/ 0); + if (reqd_bytes <= len_no_metadata) { + break; // done + } + if (i >= 2) { + // should have been enough iterations + assert(false); + break; + } + slots = SolnType::RoundDownNumSlots(slots - 1); + } + + uint32_t num_entries = ConfigHelper::GetNumToAdd(slots); + + // Consider possible Bloom fallback for small filters + if (slots < 1024) { + size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes); + if (bloom > num_entries) { + return bloom; + } else { + return num_entries; + } + } else { + return std::min(num_entries, kMaxRibbonEntries); + } + } + + double EstimatedFpRate(size_t num_entries, + size_t len_with_metadata) override { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata); + } + uint32_t num_slots = + NumEntriesToNumSlots(static_cast(num_entries)); + SolnType fake_soln(nullptr, len_with_metadata); + fake_soln.ConfigureForNumSlots(num_slots); + return fake_soln.ExpectedFpRate(); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + // round down to multiple of 16 (segment size) + rv &= ~size_t{15}; + + return rv + kMetadataLen; + } + + private: + using TS = Standard128RibbonTypesAndSettings; + using SolnType = ribbon::SerializableInterleavedSolution; + using BandingType = ribbon::StandardBanding; + using ConfigHelper = ribbon::BandingConfigHelper1TS; + + static uint32_t NumEntriesToNumSlots(uint32_t num_entries) { + uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries); + return SolnType::RoundUpNumSlots(num_slots1); + } + + // Approximate num_entries to ensure number of bytes fits in 32 bits, which + // is not an inherent limitation but does ensure somewhat graceful Bloom + // fallback for crazy high number of entries, since the Bloom implementation + // does not support number of bytes bigger than fits in 32 bits. This is + // within an order of magnitude of implementation limit on num_slots + // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits + // (for filter metadata). + static constexpr uint32_t kMaxRibbonEntries = 950000000; // ~ 1 billion + + // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate. 
+ double desired_one_in_fp_rate_; + + // For warnings, or can be nullptr + Logger* info_log_; + + // For falling back on Bloom filter in some exceptional cases and + // very small filter cases + FastLocalBloomBitsBuilder bloom_fallback_; +}; + +// for the linker, at least with DEBUG_LEVEL=2 +constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries; + +class Standard128RibbonBitsReader : public FilterBitsReader { + public: + Standard128RibbonBitsReader(const char* data, size_t len_bytes, + uint32_t num_blocks, uint32_t seed) + : soln_(const_cast(data), len_bytes) { + soln_.ConfigureForNumBlocks(num_blocks); + hasher_.SetOrdinalSeed(seed); + } + + // No Copy allowed + Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete; + void operator=(const Standard128RibbonBitsReader&) = delete; + + ~Standard128RibbonBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + return soln_.FilterQuery(h, hasher_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + struct SavedData { + uint64_t seeded_hash; + uint32_t segment_num; + uint32_t num_columns; + uint32_t start_bits; + }; + std::array saved; + for (int i = 0; i < num_keys; ++i) { + ribbon::InterleavedPrepareQuery( + GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash, + &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = ribbon::InterleavedFilterQuery( + saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns, + saved[i].start_bits, hasher_, soln_); + } + } + + private: + using TS = Standard128RibbonTypesAndSettings; + ribbon::SerializableInterleavedSolution soln_; + ribbon::StandardHasher hasher_; +}; + +// ##################### Legacy Bloom implementation ################### // + using LegacyBloomImpl = LegacyLocalityBloomImpl; class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { @@ -355,21 +767,25 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { void AddKey(const Slice& key) override; - Slice Finish(std::unique_ptr* buf) override; + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); + } - int CalculateNumEntry(const uint32_t bytes) override; + Slice Finish(std::unique_ptr* buf) override; - uint32_t CalculateSpace(const int num_entry) override { + size_t CalculateSpace(size_t num_entries) override { uint32_t dont_care1; uint32_t dont_care2; - return CalculateSpace(num_entry, &dont_care1, &dont_care2); + return CalculateSpace(num_entries, &dont_care1, &dont_care2); } double EstimatedFpRate(size_t keys, size_t bytes) override { - return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, + return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen, num_probes_); } + size_t ApproximateNumEntries(size_t bytes) override; + private: int bits_per_key_; int num_probes_; @@ -380,11 +796,11 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { uint32_t GetTotalBitsForLocality(uint32_t total_bits); // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, + char* ReserveSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines); // Implementation-specific variant of public CalculateSpace - uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines); // Assuming single threaded access to this function. 
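For reference, the filter trailer written by Standard128RibbonBitsBuilder::Finish above (and parsed by GetRibbonBitsReader further down in this patch) is a fixed 5-byte suffix: a -2 marker byte, the ordinal hash seed, and a 24-bit little-endian block count. The following is an editor's sketch of that layout, not code from the patch; the struct and helper names are invented for illustration.

#include <cassert>
#include <cstdint>

struct RibbonMeta {
  uint32_t seed;        // ordinal seed in [0, 255]
  uint32_t num_blocks;  // 24-bit interleaved-solution block count
};

// Writes the 5 trailer bytes at the end of the filter buffer.
inline void EncodeRibbonMeta(char* end, const RibbonMeta& m) {
  end[-5] = static_cast<char>(-2);  // marker for Standard128 Ribbon
  end[-4] = static_cast<char>(m.seed);
  end[-3] = static_cast<char>(m.num_blocks & 255);
  end[-2] = static_cast<char>((m.num_blocks >> 8) & 255);
  end[-1] = static_cast<char>((m.num_blocks >> 16) & 255);
}

// Reads the trailer back, mirroring what GetRibbonBitsReader does.
inline RibbonMeta DecodeRibbonMeta(const char* end) {
  assert(static_cast<int8_t>(end[-5]) == -2);
  RibbonMeta m;
  m.seed = static_cast<uint8_t>(end[-4]);
  m.num_blocks = static_cast<uint8_t>(end[-3]);
  m.num_blocks |= static_cast<uint32_t>(static_cast<uint8_t>(end[-2])) << 8;
  m.num_blocks |= static_cast<uint32_t>(static_cast<uint8_t>(end[-1])) << 16;
  return m;
}

As the reader code in this patch notes, a decoded num_blocks of 0 or 1 is not a usable Ribbon filter and is treated as "always true" instead.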
@@ -452,7 +868,29 @@ Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { buf->reset(const_data); hash_entries_.clear(); - return Slice(data, total_bits / 8 + 5); + return Slice(data, total_bits / 8 + kMetadataLen); +} + +size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + + uint64_t total_bits_tmp = bytes * 8; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000}); + + uint32_t high = static_cast(total_bits_tmp) / + static_cast(bits_per_key_) + + 1; + uint32_t low = 1; + uint32_t n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + return n; } uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { @@ -467,14 +905,18 @@ uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { return num_lines * (CACHE_LINE_SIZE * 8); } -uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, +uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines) { assert(bits_per_key_); - if (num_entry != 0) { - uint32_t total_bits_tmp = static_cast(num_entry * bits_per_key_); - - *total_bits = GetTotalBitsForLocality(total_bits_tmp); + if (num_entries != 0) { + size_t total_bits_tmp = num_entries * bits_per_key_; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000}); + + *total_bits = + GetTotalBitsForLocality(static_cast(total_bits_tmp)); *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); assert(*total_bits > 0 && *total_bits % 8 == 0); } else { @@ -485,34 +927,19 @@ uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, // Reserve space for Filter uint32_t sz = *total_bits / 8; - sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + sz += kMetadataLen; // 4 bytes for num_lines, 1 byte for num_probes return sz; } -char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, +char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines) { - uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines); char* data = new char[sz]; memset(data, 0, sz); return data; } -int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { - assert(bits_per_key_); - assert(bytes > 0); - int high = static_cast(bytes * 8 / bits_per_key_ + 1); - int low = 1; - int n = high; - for (; n >= low; n--) { - if (CalculateSpace(n) <= bytes) { - break; - } - } - assert(n < high); // High should be an overestimation - return n; -} - inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits) { @@ -595,11 +1022,13 @@ const std::vector BloomFilterPolicy::kAllFixedImpls = { kLegacyBloom, kDeprecatedBlock, kFastLocalBloom, + kStandard128Ribbon, }; const std::vector BloomFilterPolicy::kAllUserModes = { kDeprecatedBlock, - kAuto, + kAutoBloom, + kStandard128Ribbon, }; BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) @@ -616,6 +1045,15 @@ BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) // point are interpreted accurately. millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); + // For now configure Ribbon filter to match Bloom FP rate and save + // memory. 
(Ribbon bits per key will be ~30% less than Bloom bits per key + // for same FP rate.) + desired_one_in_fp_rate_ = + 1.0 / BloomMath::CacheLocalFpRate( + bits_per_key, + FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_), + /*cache_line_bits*/ 512); + // For better or worse, this is a rounding up of a nudged rounding up, // e.g. 7.4999999999999 will round up to 8, but that provides more // predictability against small arithmetic errors in floating point. @@ -700,7 +1138,7 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( // one exhaustive switch without (risky) recursion for (int i = 0; i < 2; ++i) { switch (cur) { - case kAuto: + case kAutoBloom: if (context.table_options.format_version < 5) { cur = kLegacyBloom; } else { @@ -733,6 +1171,10 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( } return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log); + case kStandard128Ribbon: + return new Standard128RibbonBitsBuilder( + desired_one_in_fp_rate_, millibits_per_key_, + offm ? &aggregate_rounding_balance_ : nullptr, context.info_log); } } assert(false); @@ -753,7 +1195,7 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); - if (len_with_meta <= 5) { + if (len_with_meta <= kMetadataLen) { // filter is empty or broken. Treat like zero keys added. return new AlwaysFalseFilter(); } @@ -771,7 +1213,7 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( // len_with_meta +-----------------------------------+ int8_t raw_num_probes = - static_cast(contents.data()[len_with_meta - 5]); + static_cast(contents.data()[len_with_meta - kMetadataLen]); // NB: *num_probes > 30 and < 128 probably have not been used, because of // BloomFilterPolicy::initialize, unless directly calling // LegacyBloomBitsBuilder as an API, but we are leaving those cases in @@ -780,13 +1222,20 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( if (raw_num_probes < 1) { // Note: < 0 (or unsigned > 127) indicate special new implementations // (or reserved for future use) - if (raw_num_probes == -1) { - // Marker for newer Bloom implementations - return GetBloomBitsReader(contents); + switch (raw_num_probes) { + case 0: + // Treat as zero probes (always FP) + return new AlwaysTrueFilter(); + case -1: + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + case -2: + // Marker for Ribbon implementations + return GetRibbonBitsReader(contents); + default: + // Reserved (treat as zero probes, always FP, for now) + return new AlwaysTrueFilter(); } - // otherwise - // Treat as zero probes (always FP) for now. 
- return new AlwaysTrueFilter(); } // else attempt decode for LegacyBloomBitsReader @@ -794,7 +1243,7 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( assert(num_probes >= 1); assert(num_probes <= 127); - uint32_t len = len_with_meta - 5; + uint32_t len = len_with_meta - kMetadataLen; assert(len > 0); uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); @@ -824,11 +1273,34 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( log2_cache_line_size); } +FilterBitsReader* BloomFilterPolicy::GetRibbonBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + uint32_t len = len_with_meta - kMetadataLen; + + assert(len > 0); // precondition + + uint32_t seed = static_cast(contents.data()[len + 1]); + uint32_t num_blocks = static_cast(contents.data()[len + 2]); + num_blocks |= static_cast(contents.data()[len + 3]) << 8; + num_blocks |= static_cast(contents.data()[len + 4]) << 16; + if (num_blocks < 2) { + // Not supported + // num_blocks == 1 is not used because num_starts == 1 is problematic + // for the hashing scheme. num_blocks == 0 is unused because there's + // already a concise encoding of an "always false" filter. + // Return something safe: + return new AlwaysTrueFilter(); + } + return new Standard128RibbonBitsReader(contents.data(), len, num_blocks, + seed); +} + // For newer Bloom filter implementations FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); - uint32_t len = len_with_meta - 5; + uint32_t len = len_with_meta - kMetadataLen; assert(len > 0); // precondition @@ -890,7 +1362,7 @@ const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, if (use_block_based_builder) { m = BloomFilterPolicy::kDeprecatedBlock; } else { - m = BloomFilterPolicy::kAuto; + m = BloomFilterPolicy::kAutoBloom; } assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), BloomFilterPolicy::kAllUserModes.end(), @@ -898,6 +1370,12 @@ const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, return new BloomFilterPolicy(bits_per_key, m); } +extern const FilterPolicy* NewRibbonFilterPolicy( + double bloom_equivalent_bits_per_key) { + return new BloomFilterPolicy(bloom_equivalent_bits_per_key, + BloomFilterPolicy::kStandard128Ribbon); +} + FilterBuildingContext::FilterBuildingContext( const BlockBasedTableOptions& _table_options) : table_options(_table_options) {} @@ -908,6 +1386,8 @@ Status FilterPolicy::CreateFromString( const ConfigOptions& /*options*/, const std::string& value, std::shared_ptr* policy) { const std::string kBloomName = "bloomfilter:"; + const std::string kExpRibbonName = "experimental_ribbon:"; + const std::string kRibbonName = "ribbonfilter:"; if (value == kNullptrString || value == "rocksdb.BuiltinBloomFilter") { policy->reset(); #ifndef ROCKSDB_LITE @@ -924,6 +1404,15 @@ Status FilterPolicy::CreateFromString( policy->reset( NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); } + } else if (value.compare(0, kExpRibbonName.size(), kExpRibbonName) == 0) { + double bloom_equivalent_bits_per_key = + ParseDouble(trim(value.substr(kExpRibbonName.size()))); + policy->reset( + NewExperimentalRibbonFilterPolicy(bloom_equivalent_bits_per_key)); + } else if (value.compare(0, kRibbonName.size(), kRibbonName) == 0) { + double bloom_equivalent_bits_per_key = + ParseDouble(trim(value.substr(kRibbonName.size()))); + policy->reset(NewRibbonFilterPolicy(bloom_equivalent_bits_per_key)); } else { return 
Status::NotFound("Invalid filter policy name ", value); #else diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index 783373b2627..1a8acfc9d4f 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -25,14 +25,17 @@ class Slice; class BuiltinFilterBitsBuilder : public FilterBitsBuilder { public: // Calculate number of bytes needed for a new filter, including - // metadata. Passing the result to CalculateNumEntry should - // return >= the num_entry passed in. - virtual uint32_t CalculateSpace(const int num_entry) = 0; + // metadata. Passing the result to ApproximateNumEntries should + // (ideally, usually) return >= the num_entry passed in. + // When optimize_filters_for_memory is enabled, this function + // is not authoritative but represents a target size that should + // be close to the average size. + virtual size_t CalculateSpace(size_t num_entries) = 0; // Returns an estimate of the FP rate of the returned filter if - // `keys` keys are added and the filter returned by Finish is `bytes` - // bytes. - virtual double EstimatedFpRate(size_t keys, size_t bytes) = 0; + // `num_entries` keys are added and the filter returned by Finish + // is `bytes` bytes. + virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; }; // RocksDB built-in filter policy for Bloom or Bloom-like filters. @@ -64,10 +67,12 @@ class BloomFilterPolicy : public FilterPolicy { // FastLocalBloomImpl. // NOTE: TESTING ONLY as this mode does not check format_version kFastLocalBloom = 2, - // Automatically choose from the above (except kDeprecatedBlock) based on + // A Bloom alternative saving about 30% space for ~3-4x construction + // CPU time. See ribbon_alg.h and ribbon_impl.h. + kStandard128Ribbon = 3, + // Automatically choose between kLegacyBloom and kFastLocalBloom based on // context at build time, including compatibility with format_version. - // NOTE: This is currently the only recommended mode that is user exposed. - kAuto = 100, + kAutoBloom = 100, }; // All the different underlying implementations that a BloomFilterPolicy // might use, as a mode that says "always use this implementation." @@ -115,8 +120,12 @@ class BloomFilterPolicy : public FilterPolicy { int GetMillibitsPerKey() const { return millibits_per_key_; } // Essentially for testing only: legacy whole bits/key int GetWholeBitsPerKey() const { return whole_bits_per_key_; } + // Testing only + Mode GetMode() const { return mode_; } private: + // Bits per key settings are for configuring Bloom filters. + // Newer filters support fractional bits per key. For predictable behavior // of 0.001-precision values across floating point implementations, we // round to thousandths of a bit (on average) per key. @@ -127,6 +136,10 @@ class BloomFilterPolicy : public FilterPolicy { // behavior with format_version < 5 just in case.) int whole_bits_per_key_; + // For configuring Ribbon filter: a desired value for 1/fp_rate. For + // example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + // Selected mode (a specific implementation or way of selecting an // implementation) for building new SST filters. 
Mode mode_; @@ -147,6 +160,9 @@ class BloomFilterPolicy : public FilterPolicy { // For newer Bloom filter implementation(s) FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; + + // For Ribbon filter implementation(s) + FilterBitsReader* GetRibbonBitsReader(const Slice& contents) const; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index a104bec4749..0e336c37fb6 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -22,42 +22,63 @@ FullFilterBlockBuilder::FullFilterBlockBuilder( whole_key_filtering_(whole_key_filtering), last_whole_key_recorded_(false), last_prefix_recorded_(false), - num_added_(0) { + last_key_in_domain_(false), + any_added_(false) { assert(filter_bits_builder != nullptr); filter_bits_builder_.reset(filter_bits_builder); } -void FullFilterBlockBuilder::Add(const Slice& key) { - const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); +size_t FullFilterBlockBuilder::EstimateEntriesAdded() { + return filter_bits_builder_->EstimateEntriesAdded(); +} + +void FullFilterBlockBuilder::Add(const Slice& key_without_ts) { + const bool add_prefix = + prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts); + + if (!last_prefix_recorded_ && last_key_in_domain_) { + // We can reach here when a new filter partition starts in partitioned + // filter. The last prefix in the previous partition should be added if + // necessary regardless of key_without_ts, to support prefix SeekForPrev. + AddKey(last_prefix_str_); + last_prefix_recorded_ = true; + } + if (whole_key_filtering_) { if (!add_prefix) { - AddKey(key); + AddKey(key_without_ts); } else { // if both whole_key and prefix are added to bloom then we will have whole - // key and prefix addition being interleaved and thus cannot rely on the - // bits builder to properly detect the duplicates by comparing with the - // last item. + // key_without_ts and prefix addition being interleaved and thus cannot + // rely on the bits builder to properly detect the duplicates by comparing + // with the last item. 
Slice last_whole_key = Slice(last_whole_key_str_); - if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) { - AddKey(key); + if (!last_whole_key_recorded_ || + last_whole_key.compare(key_without_ts) != 0) { + AddKey(key_without_ts); last_whole_key_recorded_ = true; - last_whole_key_str_.assign(key.data(), key.size()); + last_whole_key_str_.assign(key_without_ts.data(), + key_without_ts.size()); } } } if (add_prefix) { - AddPrefix(key); + last_key_in_domain_ = true; + AddPrefix(key_without_ts); + } else { + last_key_in_domain_ = false; } } // Add key to filter if needed inline void FullFilterBlockBuilder::AddKey(const Slice& key) { filter_bits_builder_->AddKey(key); - num_added_++; + any_added_ = true; } // Add prefix to filter if needed void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + assert(prefix_extractor_ && prefix_extractor_->InDomain(key)); Slice prefix = prefix_extractor_->Transform(key); if (whole_key_filtering_) { // if both whole_key and prefix are added to bloom then we will have whole @@ -85,8 +106,8 @@ Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, Reset(); // In this impl we ignore BlockHandle *status = Status::OK(); - if (num_added_ != 0) { - num_added_ = 0; + if (any_added_) { + any_added_ = false; return filter_bits_builder_->Finish(&filter_data_); } return Slice(); @@ -245,9 +266,9 @@ void FullFilterBlockReader::MayMatch( MultiGetRange filter_range(*range, range->begin(), range->end()); for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { if (!prefix_extractor) { - keys[num_keys++] = &iter->ukey; - } else if (prefix_extractor->InDomain(iter->ukey)) { - prefixes.emplace_back(prefix_extractor->Transform(iter->ukey)); + keys[num_keys++] = &iter->ukey_without_ts; + } else if (prefix_extractor->InDomain(iter->ukey_without_ts)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey_without_ts)); keys[num_keys++] = &prefixes.back(); } else { filter_range.SkipKey(iter); @@ -283,16 +304,16 @@ size_t FullFilterBlockReader::ApproximateMemoryUsage() const { } bool FullFilterBlockReader::RangeMayExist( - const Slice* iterate_upper_bound, const Slice& user_key, + const Slice* iterate_upper_bound, const Slice& user_key_without_ts, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, bool no_io, BlockCacheLookupContext* lookup_context) { - if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) { *filter_checked = false; return true; } - Slice prefix = prefix_extractor->Transform(user_key); + Slice prefix = prefix_extractor->Transform(user_key_without_ts); if (need_upper_bound_check && !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { *filter_checked = false; @@ -318,7 +339,8 @@ bool FullFilterBlockReader::IsFilterCompatible( } Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix - if (!comparator->Equal(prefix, upper_bound_xform)) { + if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform, + false) != 0) { // second check if user_key's prefix is the immediate predecessor of // upper_bound and have the same length. If so, we know for sure all // keys in the range [user_key, upper_bound) share the same prefix. 
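The FilterPolicy::CreateFromString changes earlier in this patch add a "ribbonfilter:" spelling alongside the older "experimental_ribbon:", both mapping to NewRibbonFilterPolicy, whose single argument is Bloom-equivalent bits per key (internally converted to desired_one_in_fp_rate_ via the cache-local Bloom FP-rate formula). A usage sketch follows; it is illustrative only and assumes the default rocksdb namespace.

#include <memory>
#include "rocksdb/filter_policy.h"
#include "rocksdb/table.h"

void UseRibbonFilter(rocksdb::BlockBasedTableOptions* bbto) {
  // 10 "Bloom-equivalent" bits per key: the Ribbon filter targets roughly the
  // FP rate a 10 bits/key Bloom filter would have, while using less space.
  bbto->filter_policy.reset(rocksdb::NewRibbonFilterPolicy(10));
}

// The same policy can be selected from an options string handled by
// FilterPolicy::CreateFromString(), e.g. "ribbonfilter:10"
// (the older "experimental_ribbon:10" spelling is still accepted).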
diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 42f4dbbc3a5..e5f6df659f0 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -50,8 +50,9 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { virtual bool IsBlockBased() override { return false; } virtual void StartBlock(uint64_t /*block_offset*/) override {} - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { return !any_added_; } + virtual size_t EstimateEntriesAdded() override; virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; @@ -61,6 +62,7 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { virtual void Reset(); void AddPrefix(const Slice& key); const SliceTransform* prefix_extractor() { return prefix_extractor_; } + const std::string& last_prefix_str() const { return last_prefix_str_; } private: // important: all of these might point to invalid addresses @@ -72,10 +74,13 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { std::string last_whole_key_str_; bool last_prefix_recorded_; std::string last_prefix_str_; - - uint32_t num_added_; + // Whether prefix_extractor_->InDomain(last_whole_key_) is true. + // Used in partitioned filters so that the last prefix from the previous + // filter partition will be added to the current partition if + // last_key_in_domain_ is true, regardless of the current key. + bool last_key_in_domain_; + bool any_added_; std::unique_ptr filter_data_; - }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 496b149ab75..b3563da3ea3 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -3,13 +3,16 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "table/block_based/full_filter_block.h" + #include -#include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/status.h" #include "table/block_based/block_based_table_reader.h" -#include "table/block_based/mock_block_based_table.h" #include "table/block_based/filter_policy_internal.h" +#include "table/block_based/mock_block_based_table.h" +#include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -224,8 +227,8 @@ class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { return rv; } - int CalculateNumEntry(const uint32_t bytes) override { - return b_->CalculateNumEntry(bytes); + size_t ApproximateNumEntries(size_t bytes) override { + return b_->ApproximateNumEntries(bytes); } size_t CountUnique() { return uniq_.size(); } @@ -239,11 +242,9 @@ TEST_F(FullFilterBlockTest, DuplicateEntries) { const bool WHOLE_KEY = true; FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, bits_builder); - ASSERT_EQ(0, builder.NumAdded()); ASSERT_EQ(0, bits_builder->CountUnique()); // adds key and empty prefix; both abstractions count them builder.Add("key1"); - ASSERT_EQ(2, builder.NumAdded()); ASSERT_EQ(2, bits_builder->CountUnique()); // Add different key (unique) and also empty prefix (not unique). 
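// Editor's note (illustrative, not part of the patch): the builtin bits
// builders only suppress consecutive duplicates, by comparing each new hash
// with the last one added, which is why the block builder also remembers
// last_whole_key_str_ / last_prefix_str_ when whole-key and prefix adds are
// interleaved. A hypothetical sequence, assuming a prefix extractor that
// keeps the first 7 bytes of the key:
//   builder.Add("prefix1key1");  // adds whole key and prefix "prefix1"
//   builder.Add("prefix1key1");  // exact repeat: neither is re-added
//   builder.Add("prefix1key2");  // new whole key added; same prefix skipped
// EstimateEntriesAdded() therefore reflects unique adds, which is what the
// updated SingleChunk test relies on when it expects 4 entries for
// {foo, bar, box, box, hello}.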
// From here in this test, it's immaterial whether the block builder @@ -262,7 +263,6 @@ TEST_F(FullFilterBlockTest, DuplicateEntries) { const bool WHOLE_KEY = true; FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, bits_builder); - ASSERT_EQ(0, builder.NumAdded()); builder.Add(""); // test with empty key too builder.Add("prefix1key1"); builder.Add("prefix1key1"); @@ -275,14 +275,19 @@ TEST_F(FullFilterBlockTest, DuplicateEntries) { TEST_F(FullFilterBlockTest, SingleChunk) { FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); - ASSERT_EQ(0, builder.NumAdded()); + ASSERT_TRUE(builder.IsEmpty()); builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); builder.Add("bar"); builder.Add("box"); builder.Add("box"); builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice slice = builder.Finish(); + // "box" only counts once + ASSERT_EQ(4, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); CachableEntry block( new ParsedFullFilterBlock(table_options_.filter_policy.get(), diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index 7a281edbf08..4d8544161ce 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -66,7 +66,7 @@ Status HashIndexReader::Create(const BlockBasedTable* table, RandomAccessFileReader* const file = rep->file.get(); const Footer& footer = rep->footer; - const ImmutableCFOptions& ioptions = rep->ioptions; + const ImmutableOptions& ioptions = rep->ioptions; const PersistentCacheOptions& cache_options = rep->persistent_cache_options; MemoryAllocator* const memory_allocator = GetMemoryAllocator(rep->table_options); diff --git a/table/block_based/index_reader_common.cc b/table/block_based/index_reader_common.cc index 76f894d59ff..275ae56dc19 100644 --- a/table/block_based/index_reader_common.cc +++ b/table/block_based/index_reader_common.cc @@ -26,7 +26,8 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( const Status s = table->RetrieveBlock( prefetch_buffer, read_options, rep->footer.index_handle(), UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, - get_context, lookup_context, /* for_compaction */ false, use_cache); + get_context, lookup_context, /* for_compaction */ false, use_cache, + /* wait_for_cache */ true); return s; } diff --git a/table/block_based/mock_block_based_table.h b/table/block_based/mock_block_based_table.h index e1dcf0ebbfc..e0533a71753 100644 --- a/table/block_based/mock_block_based_table.h +++ b/table/block_based/mock_block_based_table.h @@ -23,7 +23,7 @@ class MockBlockBasedTableTester { public: Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; EnvOptions env_options_; BlockBasedTableOptions table_options_; InternalKeyComparator icomp_; @@ -47,7 +47,7 @@ class MockBlockBasedTableTester { context.column_family_name = "mock_cf"; context.compaction_style = ioptions_.compaction_style; context.level_at_creation = kMockLevel; - context.info_log = ioptions_.info_log; + context.info_log = ioptions_.logger; return BloomFilterPolicy::GetBuilderFromContext(context); } }; diff --git a/table/block_based/parsed_full_filter_block.h b/table/block_based/parsed_full_filter_block.h index 36c619921d8..95d7b520871 100644 --- a/table/block_based/parsed_full_filter_block.h +++ b/table/block_based/parsed_full_filter_block.h @@ -32,6 +32,8 @@ class ParsedFullFilterBlock { bool own_bytes() const { return 
block_contents_.own_bytes(); } + const Slice GetBlockContentsData() const { return block_contents_.data; } + private: BlockContents block_contents_; std::unique_ptr filter_bits_reader_; diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index dc25abbea41..61cd1258781 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -7,7 +7,7 @@ #include -#include "file/file_util.h" +#include "file/random_access_file_reader.h" #include "monitoring/perf_context_imp.h" #include "port/malloc.h" #include "port/port.h" @@ -33,16 +33,18 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( true /*use_delta_encoding*/, use_value_delta_encoding), p_index_builder_(p_index_builder), - keys_added_to_partition_(0) { - keys_per_partition_ = - filter_bits_builder_->CalculateNumEntry(partition_size); + keys_added_to_partition_(0), + total_added_in_built_(0) { + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(partition_size)); if (keys_per_partition_ < 1) { // partition_size (minus buffer, ~10%) might be smaller than minimum // filter size, sometimes based on cache line size. Try to find that // minimum size without CalculateSpace (not necessarily available). uint32_t larger = std::max(partition_size + 4, uint32_t{16}); for (;;) { - keys_per_partition_ = filter_bits_builder_->CalculateNumEntry(larger); + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(larger)); if (keys_per_partition_ >= 1) { break; } @@ -72,15 +74,19 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( } filter_gc.push_back(std::unique_ptr(nullptr)); - // Add the prefix of the next key before finishing the partition. This hack, - // fixes a bug with format_verison=3 where seeking for the prefix would lead - // us to the previous partition. - const bool add_prefix = + // Add the prefix of the next key before finishing the partition without + // updating last_prefix_str_. This hack, fixes a bug with format_verison=3 + // where seeking for the prefix would lead us to the previous partition. 
+ const bool maybe_add_prefix = next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); - if (add_prefix) { - FullFilterBlockBuilder::AddPrefix(*next_key); + if (maybe_add_prefix) { + const Slice next_key_prefix = prefix_extractor()->Transform(*next_key); + if (next_key_prefix.compare(last_prefix_str()) != 0) { + AddKey(next_key_prefix); + } } + total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded(); Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); std::string& index_key = p_index_builder_->GetPartitionKey(); filters.push_back({index_key, filter}); @@ -98,6 +104,10 @@ void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { keys_added_to_partition_++; } +size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded(); +} + Slice PartitionedFilterBlockBuilder::Finish( const BlockHandle& last_partition_block_handle, Status* status) { if (finishing_filters == true) { @@ -127,6 +137,8 @@ Slice PartitionedFilterBlockBuilder::Finish( if (UNLIKELY(filters.empty())) { *status = Status::OK(); if (finishing_filters) { + // Simplest to just add them all at the end + total_added_in_built_ = 0; if (p_index_builder_->seperator_is_key_plus_seq()) { return index_on_filter_block_builder_.Finish(); } else { @@ -284,7 +296,8 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock( table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, get_context, lookup_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true); return s; } @@ -412,8 +425,8 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { } // TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, - bool pin) { +Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, + bool pin) { assert(table()); const BlockBasedTable::Rep* const rep = table()->get_rep(); @@ -426,12 +439,11 @@ void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, &lookup_context, &filter_block); if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Error retrieving top-level filter block while trying to " - "cache filter partitions: %s", - s.ToString().c_str()); - IGNORE_STATUS_IF_ERROR(s); - return; + ROCKS_LOG_ERROR(rep->ioptions.logger, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return s; } // Before read partitions, prefetch them to avoid lots of IOs @@ -457,14 +469,18 @@ void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, + false /* Implicit autoreadahead */); - prefetch_buffer.reset(new FilePrefetchBuffer()); IOOptions opts; - s = PrepareIOFromReadOptions(ro, rep->file->env(), opts); + s = rep->file->PrepareIOOptions(ro, opts); if (s.ok()) { s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, static_cast(prefetch_len)); } + if (!s.ok()) { + return s; + } // After prefetch, read the partitions one by one for 
(biter.SeekToFirst(); biter.Valid(); biter.Next()) { @@ -475,19 +491,22 @@ void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kFilter, nullptr /* get_context */, &lookup_context, - nullptr /* contents */); - + /* wait */ true, &block, BlockType::kFilter, nullptr /* get_context */, + &lookup_context, nullptr /* contents */); + if (!s.ok()) { + return s; + } assert(s.ok() || block.GetValue() == nullptr); - if (s.ok() && block.GetValue() != nullptr) { + + if (block.GetValue() != nullptr) { if (block.IsCached()) { if (pin) { filter_map_[handle.offset()] = std::move(block); } } } - IGNORE_STATUS_IF_ERROR(s); } + return biter.status(); } const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 2ccc8f8bcf5..0d0ee89e79e 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -8,6 +8,7 @@ #include #include #include + #include "db/dbformat.h" #include "index_builder.h" #include "rocksdb/options.h" @@ -33,6 +34,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { void AddKey(const Slice& key) override; void Add(const Slice& key) override; + size_t EstimateEntriesAdded() override; virtual Slice Finish(const BlockHandle& last_partition_block_handle, Status* status) override; @@ -62,6 +64,9 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { uint32_t keys_per_partition_; // The number of keys added to the last partition so far uint32_t keys_added_to_partition_; + // According to the bits builders, how many keys/prefixes added + // in all the filters we have fully built + uint64_t total_added_in_built_; BlockHandle last_encoded_handle_; }; @@ -130,7 +135,7 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon { uint64_t block_offset, BlockHandle filter_handle, bool no_io, BlockCacheLookupContext* lookup_context, FilterManyFunction filter_function) const; - void CacheDependencies(const ReadOptions& ro, bool pin) override; + Status CacheDependencies(const ReadOptions& ro, bool pin) override; const InternalKeyComparator* internal_comparator() const; bool index_key_includes_seq() const; diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index e23f910b537..7b4d49baf11 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -12,7 +12,6 @@ #include "table/block_based/filter_policy_internal.h" #include "index_builder.h" -#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -59,7 +58,7 @@ class PartitionedFilterBlockTest virtual public ::testing::WithParamInterface { public: Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; EnvOptions env_options_; BlockBasedTableOptions table_options_; InternalKeyComparator icomp_; diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index 0dbd132b16a..acb40f125d1 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
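Both PartitionedFilterBlockReader::CacheDependencies above and PartitionIndexReader::CacheDependencies below prefetch one contiguous byte range covering every partition before loading the partitions into the block cache one by one. An editor's sketch of that range computation, with hypothetical types (the 5-byte constant corresponds to the per-block trailer in the BlockBasedTable format):

#include <cstdint>

struct Handle {
  uint64_t offset;
  uint64_t size;
};

// 1 byte compression type + 4 bytes checksum after every block.
constexpr uint64_t kBlockTrailerSize = 5;

// Given the handles of the first and last partition (as produced by the
// partition index iterator), compute the single prefetch window.
inline void PartitionPrefetchRange(const Handle& first, const Handle& last,
                                   uint64_t* prefetch_off,
                                   uint64_t* prefetch_len) {
  *prefetch_off = first.offset;
  uint64_t last_off = last.offset + last.size + kBlockTrailerSize;
  *prefetch_len = last_off - *prefetch_off;
}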
#include "table/block_based/partitioned_index_reader.h" -#include "file/file_util.h" +#include "file/random_access_file_reader.h" #include "table/block_based/partitioned_index_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -146,9 +146,10 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, uint64_t last_off = handle.offset() + block_size(handle); uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer); + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, + false /*Implicit auto readahead*/); IOOptions opts; - s = PrepareIOFromReadOptions(ro, rep->file->env(), opts); + s = rep->file->PrepareIOOptions(ro, opts); if (s.ok()) { s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, static_cast(prefetch_len)); @@ -166,8 +167,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, - /*contents=*/nullptr); + /*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr, + &lookup_context, /*contents=*/nullptr); if (!s.ok()) { return s; diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc index db33e934067..dae5ddac2ac 100644 --- a/table/block_based/uncompression_dict_reader.cc +++ b/table/block_based/uncompression_dict_reader.cc @@ -60,11 +60,11 @@ Status UncompressionDictReader::ReadUncompressionDictionary( prefetch_buffer, read_options, rep->compression_dict_handle, UncompressionDict::GetEmptyDict(), uncompression_dict, BlockType::kCompressionDictionary, get_context, lookup_context, - /* for_compaction */ false, use_cache); + /* for_compaction */ false, use_cache, /* wait_for_cache */ true); if (!s.ok()) { ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep->ioptions.logger, "Encountered error while reading data from compression dictionary " "block %s", s.ToString().c_str()); diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index b0880d516a2..90558168eeb 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -12,7 +12,6 @@ #include #include -#include "file/file_util.h" #include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" @@ -30,9 +29,9 @@ namespace ROCKSDB_NAMESPACE { inline void BlockFetcher::CheckBlockChecksum() { // Check the crc of the type and the block contents if (read_options_.verify_checksums) { - status_ = ROCKSDB_NAMESPACE::VerifyBlockChecksum( + io_status_ = status_to_io_status(ROCKSDB_NAMESPACE::VerifyBlockChecksum( footer_.checksum(), slice_.data(), block_size_, file_->file_name(), - handle_.offset()); + handle_.offset())); } } @@ -46,9 +45,9 @@ inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { return true; } else { // uncompressed page is not found - if (ioptions_.info_log && !status.IsNotFound()) { + if (ioptions_.logger && !status.IsNotFound()) { assert(!status.ok()); - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "Error reading from persistent cache. 
%s", status.ToString().c_str()); } @@ -60,16 +59,19 @@ inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { inline bool BlockFetcher::TryGetFromPrefetchBuffer() { if (prefetch_buffer_ != nullptr) { IOOptions opts; - Status s = PrepareIOFromReadOptions(read_options_, file_->env(), opts); - if (s.ok() && prefetch_buffer_->TryReadFromCache( - opts, handle_.offset(), block_size_with_trailer_, &slice_, - for_compaction_)) { + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + if (io_s.ok() && prefetch_buffer_->TryReadFromCache( + opts, handle_.offset(), block_size_with_trailer_, + &slice_, &io_s, for_compaction_)) { CheckBlockChecksum(); - if (!status_.ok()) { + if (!io_status_.ok()) { return true; } got_from_prefetch_buffer_ = true; used_buf_ = const_cast(slice_.data()); + } else if (!io_s.ok()) { + io_status_ = io_s; + return true; } } return got_from_prefetch_buffer_; @@ -80,18 +82,18 @@ inline bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() { cache_options_.persistent_cache->IsCompressed()) { // lookup uncompressed cache mode p-cache std::unique_ptr raw_data; - status_ = PersistentCacheHelper::LookupRawPage( - cache_options_, handle_, &raw_data, block_size_with_trailer_); - if (status_.ok()) { + io_status_ = status_to_io_status(PersistentCacheHelper::LookupRawPage( + cache_options_, handle_, &raw_data, block_size_with_trailer_)); + if (io_status_.ok()) { heap_buf_ = CacheAllocationPtr(raw_data.release()); used_buf_ = heap_buf_.get(); slice_ = Slice(heap_buf_.get(), block_size_); return true; - } else if (!status_.IsNotFound() && ioptions_.info_log) { - assert(!status_.ok()); - ROCKS_LOG_INFO(ioptions_.info_log, + } else if (!io_status_.IsNotFound() && ioptions_.logger) { + assert(!io_status_.ok()); + ROCKS_LOG_INFO(ioptions_.logger, "Error reading from persistent cache. 
%s", - status_.ToString().c_str()); + io_status_.ToString().c_str()); } } return false; @@ -134,7 +136,7 @@ inline void BlockFetcher::PrepareBufferForBlockFromFile() { } inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { - if (status_.ok() && read_options_.fill_cache && + if (io_status_.ok() && read_options_.fill_cache && cache_options_.persistent_cache && cache_options_.persistent_cache->IsCompressed()) { // insert to raw cache @@ -144,8 +146,8 @@ inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { } inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { - if (status_.ok() && !got_from_prefetch_buffer_ && read_options_.fill_cache && - cache_options_.persistent_cache && + if (io_status_.ok() && !got_from_prefetch_buffer_ && + read_options_.fill_cache && cache_options_.persistent_cache && !cache_options_.persistent_cache->IsCompressed()) { // insert to uncompressed cache PersistentCacheHelper::InsertUncompressedPage(cache_options_, handle_, @@ -213,26 +215,26 @@ inline void BlockFetcher::GetBlockContents() { #endif } -Status BlockFetcher::ReadBlockContents() { +IOStatus BlockFetcher::ReadBlockContents() { if (TryGetUncompressBlockFromPersistentCache()) { compression_type_ = kNoCompression; #ifndef NDEBUG contents_->is_raw_block = true; #endif // NDEBUG - return Status::OK(); + return IOStatus::OK(); } if (TryGetFromPrefetchBuffer()) { - if (!status_.ok()) { - return status_; + if (!io_status_.ok()) { + return io_status_; } } else if (!TryGetCompressedBlockFromPersistentCache()) { IOOptions opts; - status_ = PrepareIOFromReadOptions(read_options_, file_->env(), opts); + io_status_ = file_->PrepareIOOptions(read_options_, opts); // Actual file read - if (status_.ok()) { + if (io_status_.ok()) { if (file_->use_direct_io()) { PERF_TIMER_GUARD(block_read_time); - status_ = + io_status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_, &slice_, nullptr, &direct_io_buf_, for_compaction_); PERF_COUNTER_ADD(block_read_count, 1); @@ -240,8 +242,9 @@ Status BlockFetcher::ReadBlockContents() { } else { PrepareBufferForBlockFromFile(); PERF_TIMER_GUARD(block_read_time); - status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_, - &slice_, used_buf_, nullptr, for_compaction_); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, used_buf_, nullptr, for_compaction_); PERF_COUNTER_ADD(block_read_count, 1); #ifndef NDEBUG if (slice_.data() == &stack_buf_[0]) { @@ -275,23 +278,23 @@ Status BlockFetcher::ReadBlockContents() { } PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_); - if (!status_.ok()) { - return status_; + if (!io_status_.ok()) { + return io_status_; } if (slice_.size() != block_size_with_trailer_) { - return Status::Corruption("truncated block read from " + - file_->file_name() + " offset " + - ToString(handle_.offset()) + ", expected " + - ToString(block_size_with_trailer_) + - " bytes, got " + ToString(slice_.size())); + return IOStatus::Corruption("truncated block read from " + + file_->file_name() + " offset " + + ToString(handle_.offset()) + ", expected " + + ToString(block_size_with_trailer_) + + " bytes, got " + ToString(slice_.size())); } CheckBlockChecksum(); - if (status_.ok()) { + if (io_status_.ok()) { InsertCompressedBlockToPersistentCacheIfNeeded(); } else { - return status_; + return io_status_; } } @@ -302,9 +305,9 @@ Status BlockFetcher::ReadBlockContents() { // compressed page, uncompress, update cache UncompressionContext 
context(compression_type_); UncompressionInfo info(context, uncompression_dict_, compression_type_); - status_ = UncompressBlockContents(info, slice_.data(), block_size_, - contents_, footer_.version(), ioptions_, - memory_allocator_); + io_status_ = status_to_io_status(UncompressBlockContents( + info, slice_.data(), block_size_, contents_, footer_.version(), + ioptions_, memory_allocator_)); #ifndef NDEBUG num_heap_buf_memcpy_++; #endif @@ -315,7 +318,7 @@ Status BlockFetcher::ReadBlockContents() { InsertUncompressedBlockToPersistentCacheIfNeeded(); - return status_; + return io_status_; } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_fetcher.h b/table/block_fetcher.h index c03352e986f..e06d964b52d 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -39,7 +39,7 @@ class BlockFetcher { BlockFetcher(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions& ioptions, + BlockContents* contents, const ImmutableOptions& ioptions, bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, @@ -64,7 +64,7 @@ class BlockFetcher { memory_allocator_compressed_(memory_allocator_compressed), for_compaction_(for_compaction) {} - Status ReadBlockContents(); + IOStatus ReadBlockContents(); CompressionType get_compression_type() const { return compression_type_; } #ifndef NDEBUG @@ -90,7 +90,7 @@ class BlockFetcher { const ReadOptions read_options_; const BlockHandle& handle_; BlockContents* contents_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const bool do_uncompress_; const bool maybe_compressed_; const BlockType block_type_; @@ -100,7 +100,7 @@ class BlockFetcher { const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; MemoryAllocator* memory_allocator_compressed_; - Status status_; + IOStatus io_status_; Slice slice_; char* used_buf_ = nullptr; AlignedBuf direct_io_buf_; diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index 0786730af64..4499272b4fa 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -6,11 +6,11 @@ #include "table/block_fetcher.h" #include "db/table_properties_collector.h" -#include "env/composite_env_wrapper.h" #include "file/file_util.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/file_system.h" #include "table/block_based/binary_search_index_reader.h" #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_based_table_factory.h" @@ -93,22 +93,23 @@ class BlockFetcherTest : public testing::Test { NewFileWriter(table_name, &writer); // Create table builder. 
- ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); InternalKeyComparator comparator(options_.comparator); ColumnFamilyOptions cf_options(options_); MutableCFOptions moptions(cf_options); - std::vector> factories; + IntTblPropCollectorFactories factories; std::unique_ptr table_builder(table_factory_.NewTableBuilder( TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, 0 /* sample_for_compression */, - CompressionOptions(), false /* skip_filters */, - kDefaultColumnFamilyName, -1 /* level */), - 0 /* column_family_id */, writer.get())); + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); // Build table. for (int i = 0; i < 9; i++) { std::string key = ToInternalKey(std::to_string(i)); - std::string value = std::to_string(i); + // Append "00000000" to string value to enhance compression ratio + std::string value = "00000000" + std::to_string(i); table_builder->Add(key, value); } ASSERT_OK(table_builder->Finish()); @@ -190,22 +191,30 @@ class BlockFetcherTest : public testing::Test { ASSERT_EQ(memcpy_stats[i].num_compressed_buf_memcpy, expected_stats.memcpy_stats.num_compressed_buf_memcpy); - ASSERT_EQ(heap_buf_allocators[i].GetNumAllocations(), - expected_stats.buf_allocation_stats.num_heap_buf_allocations); - ASSERT_EQ( - compressed_buf_allocators[i].GetNumAllocations(), - expected_stats.buf_allocation_stats.num_compressed_buf_allocations); - - // The allocated buffers are not deallocated until - // the block content is deleted. - ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0); - ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0); - blocks[i].allocation.reset(); - ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), - expected_stats.buf_allocation_stats.num_heap_buf_allocations); - ASSERT_EQ( - compressed_buf_allocators[i].GetNumDeallocations(), - expected_stats.buf_allocation_stats.num_compressed_buf_allocations); + if (kXpressCompression == compression_type) { + // XPRESS allocates memory internally, thus does not support for + // custom allocator verification + continue; + } else { + ASSERT_EQ( + heap_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + + // The allocated buffers are not deallocated until + // the block content is deleted. 
+ ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0); + blocks[i].allocation.reset(); + ASSERT_EQ( + heap_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + } } } } @@ -248,11 +257,9 @@ class BlockFetcherTest : public testing::Test { void NewFileWriter(const std::string& filename, std::unique_ptr* writer) { std::string path = Path(filename); - EnvOptions env_options; - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(path, &file, env_options)); - writer->reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), path, env_options)); + FileOptions file_options; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), path, + file_options, writer, nullptr)); } void NewFileReader(const std::string& filename, const FileOptions& opt, @@ -260,10 +267,11 @@ class BlockFetcherTest : public testing::Test { std::string path = Path(filename); std::unique_ptr f; ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); - reader->reset(new RandomAccessFileReader(std::move(f), path, env_)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); } - void NewTableReader(const ImmutableCFOptions& ioptions, + void NewTableReader(const ImmutableOptions& ioptions, const FileOptions& foptions, const InternalKeyComparator& comparator, const std::string& table_name, @@ -309,7 +317,7 @@ class BlockFetcherTest : public testing::Test { MemoryAllocator* compressed_buf_allocator, BlockContents* contents, MemcpyStats* stats, CompressionType* compresstion_type) { - ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); ReadOptions roptions; PersistentCacheOptions persistent_cache_options; Footer footer; @@ -340,7 +348,7 @@ class BlockFetcherTest : public testing::Test { MemoryAllocator* compressed_buf_allocator, BlockContents* block, std::string* result, MemcpyStats* memcpy_stats) { - ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); InternalKeyComparator comparator(options_.comparator); FileOptions foptions(options_); diff --git a/table/cuckoo/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc index f42e87bdfb1..15f21403567 100644 --- a/table/cuckoo/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -82,6 +82,8 @@ CuckooTableBuilder::CuckooTableBuilder( properties_.column_family_name = column_family_name; properties_.db_id = db_id; properties_.db_session_id = db_session_id; + status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { @@ -90,8 +92,11 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { return; } ParsedInternalKey ikey; - if (ParseInternalKey(key, &ikey) != Status::OK()) { - status_ = Status::Corruption("Unable to parse key into inernal key."); + Status pik_status = + ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + status_ = Status::Corruption("Unable to parse key into internal key. 
", + pik_status.getState()); return; } if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { @@ -247,7 +252,6 @@ Status CuckooTableBuilder::Finish() { assert(!closed_); closed_ = true; std::vector buckets; - Status s; std::string unused_bucket; if (num_entries_ > 0) { // Calculate the real hash size if module hash is enabled. diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index 322dbf0e4af..a86b6fb18e9 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -5,14 +5,16 @@ #ifndef ROCKSDB_LITE -#include -#include +#include "table/cuckoo/cuckoo_table_builder.h" + #include +#include #include +#include #include "file/random_access_file_reader.h" #include "file/writable_file_writer.h" -#include "table/cuckoo/cuckoo_table_builder.h" +#include "rocksdb/file_system.h" #include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -35,7 +37,7 @@ class CuckooBuilderTest : public testing::Test { env_ = Env::Default(); Options options; options.allow_mmap_reads = true; - env_options_ = EnvOptions(options); + file_options_ = FileOptions(options); } void CheckFileContents(const std::vector& keys, @@ -47,26 +49,25 @@ class CuckooBuilderTest : public testing::Test { uint64_t num_deletions = 0; for (const auto& key : keys) { ParsedInternalKey parsed; - if (ParseInternalKey(key, &parsed) == Status::OK() && - parsed.type == kTypeDeletion) { + Status pik_status = + ParseInternalKey(key, &parsed, true /* log_err_key */); + if (pik_status.ok() && parsed.type == kTypeDeletion) { num_deletions++; } } // Read file - std::unique_ptr read_file; - ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); uint64_t read_file_size; ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env_->GetFileSystem(), fname, file_options_, &file_reader, nullptr)); Options options; options.allow_mmap_reads = true; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); // Assert Table Properties. 
TableProperties* props = nullptr; - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, kCuckooTableMagicNumber, ioptions, &props, true /* compression_type_missing */)); @@ -157,7 +158,7 @@ class CuckooBuilderTest : public testing::Test { Env* env_; - EnvOptions env_options_; + FileOptions file_options_; std::string fname; const double kHashTableRatio = 0.9; }; @@ -165,10 +166,9 @@ class CuckooBuilderTest : public testing::Test { TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { std::unique_ptr writable_file; fname = test::PerThreadDBPath("EmptyFile"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -206,12 +206,10 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; fname = test::PerThreadDBPath("NoCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -256,12 +254,10 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -305,13 +301,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; uint32_t cuckoo_block_size = 2; fname = test::PerThreadDBPath("WithCollisionFullKey2"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder( file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, @@ -360,12 +354,10 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { } 
uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -411,12 +403,10 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash, 0 /* column_family_id */, @@ -455,12 +445,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("NoCollisionUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -500,12 +489,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -547,12 +535,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { std::vector expected_locations = {0, 1, 3, 4, 2}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - 
NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -593,12 +580,10 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { }; hash_map = std::move(hm); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -622,12 +607,10 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { uint32_t num_hash_fun = 4; std::string user_key = "repeatedkey"; - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, diff --git a/table/cuckoo/cuckoo_table_factory.cc b/table/cuckoo/cuckoo_table_factory.cc index c6d3c377ce5..4fd014e97f6 100644 --- a/table/cuckoo/cuckoo_table_factory.cc +++ b/table/cuckoo/cuckoo_table_factory.cc @@ -30,11 +30,8 @@ Status CuckooTableFactory::NewTableReader( } TableBuilder* CuckooTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - // Ignore the skipFIlters flag. 
Does not apply to this file format - // - // TODO: change builder to take the option struct return new CuckooTableBuilder( file, table_options_.hash_table_ratio, 64, @@ -42,8 +39,9 @@ TableBuilder* CuckooTableFactory::NewTableBuilder( table_builder_options.internal_comparator.user_comparator(), table_options_.cuckoo_block_size, table_options_.use_module_hash, table_options_.identity_as_first_hash, nullptr /* get_slice_hash */, - column_family_id, table_builder_options.column_family_name, - table_builder_options.db_id, table_builder_options.db_session_id); + table_builder_options.column_family_id, + table_builder_options.column_family_name, table_builder_options.db_id, + table_builder_options.db_session_id); } std::string CuckooTableFactory::GetPrintableOptions() const { @@ -95,8 +93,7 @@ static std::unordered_map cuckoo_table_type_info = CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options) : table_options_(table_options) { - ConfigurableHelper::RegisterOptions(*this, &table_options_, - &cuckoo_table_type_info); + RegisterOptions(&table_options_, &cuckoo_table_type_info); } TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { diff --git a/table/cuckoo/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h index 30d4155e1fa..a51f23e532a 100644 --- a/table/cuckoo/cuckoo_table_factory.h +++ b/table/cuckoo/cuckoo_table_factory.h @@ -56,6 +56,8 @@ class CuckooTableFactory : public TableFactory { const CuckooTableOptions& table_option = CuckooTableOptions()); ~CuckooTableFactory() {} + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kCuckooTableName(); } const char* Name() const override { return kCuckooTableName(); } using TableFactory::NewTableReader; @@ -67,7 +69,7 @@ class CuckooTableFactory : public TableFactory { TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; std::string GetPrintableOptions() const override; diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 275649ea838..4045d45287d 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -15,7 +15,9 @@ #include #include #include + #include "memory/arena.h" +#include "options/cf_options.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" #include "table/cuckoo/cuckoo_table_factory.h" @@ -33,7 +35,7 @@ const uint32_t kInvalidIndex = std::numeric_limits::max(); extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) @@ -172,7 +174,8 @@ Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; - Status s = ParseInternalKey(full_key, &found_ikey); + Status s = ParseInternalKey(full_key, &found_ikey, + false /* log_err_key */); // TODO if (!s.ok()) return s; bool dont_care __attribute__((__unused__)); get_context->SaveValue(found_ikey, value, &dont_care); diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index 5a7c8b72db4..43afd4fd7e8 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -16,7 +16,6 @@ #include 
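Because the column family id now travels inside TableBuilderOptions, a table factory override takes only the options struct and the file writer. A hedged sketch of the new shape (MyTableFactory/MyTableBuilder are hypothetical types; the member accesses mirror those used above):

TableBuilder* MyTableFactory::NewTableBuilder(
    const TableBuilderOptions& table_builder_options,
    WritableFileWriter* file) const {
  // column_family_id is no longer a separate parameter.
  return new MyTableBuilder(file, table_builder_options.column_family_id,
                            table_builder_options.column_family_name,
                            table_builder_options.db_id,
                            table_builder_options.db_session_id);
}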
"db/dbformat.h" #include "file/random_access_file_reader.h" -#include "options/cf_options.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "table/table_reader.h" @@ -25,10 +24,11 @@ namespace ROCKSDB_NAMESPACE { class Arena; class TableReader; +struct ImmutableOptions; class CuckooTableReader: public TableReader { public: - CuckooTableReader(const ImmutableCFOptions& ioptions, + CuckooTableReader(const ImmutableOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* user_comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index 5795de80d38..1914d26f13b 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -31,7 +31,6 @@ int main() { #include "util/string_util.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; -using GFLAGS_NAMESPACE::SetUsageMessage; DEFINE_string(file_dir, "", "Directory where the files will be created" " for benchmark. Added for using tmpfs."); @@ -69,7 +68,7 @@ class CuckooReaderTest : public testing::Test { CuckooReaderTest() { options.allow_mmap_reads = true; env = options.env; - env_options = EnvOptions(options); + file_options = FileOptions(options); } void SetUp(int num) { @@ -89,12 +88,9 @@ class CuckooReaderTest : public testing::Test { void CreateCuckooFileAndCheckReader( const Comparator* ucomp = BytewiseComparator()) { - std::unique_ptr writable_file; - ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - env_options)); - + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), fname, + file_options, &file_writer, nullptr)); CuckooTableBuilder builder( file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName); @@ -110,12 +106,10 @@ class CuckooReaderTest : public testing::Test { ASSERT_OK(file_writer->Close()); // Check reader now. 
- std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); @@ -140,12 +134,10 @@ class CuckooReaderTest : public testing::Test { } void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); @@ -212,12 +204,12 @@ class CuckooReaderTest : public testing::Test { uint64_t file_size; Options options; Env* env; - EnvOptions env_options; + FileOptions file_options; }; TEST_F(CuckooReaderTest, FileNotMmaped) { options.allow_mmap_reads = false; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, nullptr, 0, nullptr, nullptr); ASSERT_TRUE(reader.status().IsInvalidArgument()); ASSERT_STREQ("File is not mmaped", reader.status().getState()); @@ -331,12 +323,12 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { } auto* ucmp = BytewiseComparator(); CreateCuckooFileAndCheckReader(); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, GetSliceHash); ASSERT_OK(reader.status()); @@ -416,15 +408,13 @@ void WriteFile(const std::vector& keys, const uint64_t num, double hash_ratio) { Options options; options.allow_mmap_reads = true; - Env* env = options.env; - EnvOptions env_options = EnvOptions(options); + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); std::string fname = GetFileName(num); - std::unique_ptr writable_file; - ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - env_options)); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, file_options, &file_writer, + nullptr)); CuckooTableBuilder builder( file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, @@ -441,14 +431,13 @@ void WriteFile(const std::vector& keys, ASSERT_OK(file_writer->Close()); uint64_t file_size; - env->GetFileSize(fname, &file_size); - 
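The read side mirrors the writer change: a RandomAccessFileReader is obtained from the FileSystem factory and then driven with IOOptions. A short sketch (file name and read size are illustrative):

std::unique_ptr<RandomAccessFileReader> file_reader;
ASSERT_OK(RandomAccessFileReader::Create(env->GetFileSystem(), "/tmp/example_table",
                                         FileOptions(), &file_reader, nullptr));
char scratch[16];
Slice result;
ASSERT_OK(file_reader->Read(IOOptions(), 0 /* offset */, sizeof(scratch),
                            &result, scratch, nullptr));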
std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - - const ImmutableCFOptions ioptions(options); + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); @@ -470,18 +459,18 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { Options options; options.allow_mmap_reads = true; Env* env = options.env; - EnvOptions env_options = EnvOptions(options); + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); std::string fname = GetFileName(num); uint64_t file_size; - env->GetFileSize(fname, &file_size); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - - const ImmutableCFOptions ioptions(options); + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); diff --git a/table/format.cc b/table/format.cc index 23dc0bbc18c..5e0307b599e 100644 --- a/table/format.cc +++ b/table/format.cc @@ -14,11 +14,11 @@ #include "block_fetcher.h" #include "file/random_access_file_reader.h" -#include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" @@ -41,6 +41,7 @@ extern const uint64_t kPlainTableMagicNumber; const uint64_t kLegacyPlainTableMagicNumber = 0; const uint64_t kPlainTableMagicNumber = 0; #endif +const char* kHostnameForDbHostId = "__hostname__"; bool ShouldReportDetailedTime(Env* env, Statistics* stats) { return env != nullptr && stats != nullptr && @@ -306,8 +307,9 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, // for iterator, TryReadFromCache might do a readahead. 
Revisit to see if we // need to pass a timeout at that point if (prefetch_buffer == nullptr || - !prefetch_buffer->TryReadFromCache( - IOOptions(), read_offset, Footer::kMaxEncodedLength, &footer_input)) { + !prefetch_buffer->TryReadFromCache(IOOptions(), read_offset, + Footer::kMaxEncodedLength, + &footer_input, nullptr)) { if (file->use_direct_io()) { s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, &footer_input, nullptr, &internal_buf); @@ -345,14 +347,14 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, Status UncompressBlockContentsForCompressionType( const UncompressionInfo& uncompression_info, const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { + const ImmutableOptions& ioptions, MemoryAllocator* allocator) { Status ret = Status::OK(); assert(uncompression_info.type() != kNoCompression && "Invalid compression type"); - StopWatchNano timer(ioptions.env, ShouldReportDetailedTime( - ioptions.env, ioptions.statistics)); + StopWatchNano timer(ioptions.clock, + ShouldReportDetailedTime(ioptions.env, ioptions.stats)); size_t uncompressed_size = 0; CacheAllocationPtr ubuf = UncompressData(uncompression_info, data, n, &uncompressed_size, @@ -365,13 +367,13 @@ Status UncompressBlockContentsForCompressionType( *contents = BlockContents(std::move(ubuf), uncompressed_size); - if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) { - RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, + if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) { + RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED, + RecordTimeToHistogram(ioptions.stats, BYTES_DECOMPRESSED, contents->data.size()); - RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); + RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); TEST_SYNC_POINT_CALLBACK( "UncompressBlockContentsForCompressionType:TamperWithReturnValue", @@ -394,7 +396,7 @@ Status UncompressBlockContentsForCompressionType( Status UncompressBlockContents(const UncompressionInfo& uncompression_info, const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, MemoryAllocator* allocator) { assert(data[n] != kNoCompression); assert(data[n] == static_cast(uncompression_info.type())); @@ -403,4 +405,18 @@ Status UncompressBlockContents(const UncompressionInfo& uncompression_info, ioptions, allocator); } +// Replace the contents of db_host_id with the actual hostname, if db_host_id +// matches the keyword kHostnameForDbHostId +Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) { + assert(db_host_id); + if (*db_host_id == kHostnameForDbHostId) { + Status s = env->GetHostNameString(db_host_id); + if (!s.ok()) { + db_host_id->clear(); + } + return s; + } + + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff --git a/table/format.h b/table/format.h index e40a5ceaeac..7d7962a4d54 100644 --- a/table/format.h +++ b/table/format.h @@ -305,7 +305,7 @@ struct BlockContents { extern Status ReadBlockContents( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions& ioptions, + BlockContents* contents, const ImmutableOptions& ioptions, bool do_uncompress = true, 
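ReifyDbHostIdProperty only rewrites the property when it equals the kHostnameForDbHostId sentinel; otherwise it is a no-op. A small usage sketch, mirroring how the PlainTableBuilder hunk later in this patch treats the result:

std::string db_host_id = kHostnameForDbHostId;  // the "__hostname__" sentinel
Status s = ReifyDbHostIdProperty(Env::Default(), &db_host_id);
if (s.ok()) {
  // db_host_id now holds the real hostname (or an unchanged literal value).
} else {
  // On failure the helper clears the string; callers above just log and continue.
}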
const Slice& compression_dict = Slice(), const PersistentCacheOptions& cache_options = PersistentCacheOptions()); @@ -320,7 +320,7 @@ extern Status UncompressBlockContents(const UncompressionInfo& info, const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); // This is an extension to UncompressBlockContents that accepts @@ -329,7 +329,10 @@ extern Status UncompressBlockContents(const UncompressionInfo& info, extern Status UncompressBlockContentsForCompressionType( const UncompressionInfo& info, const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr); + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); + +// Replace db_host_id contents with the real hostname if necessary +extern Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id); // Implementation details follow. Clients should ignore, diff --git a/table/get_context.cc b/table/get_context.cc index ecd59220a73..919ed4c3429 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -4,15 +4,16 @@ // (found in the LICENSE.Apache file in the root directory). #include "table/get_context.h" + #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/read_callback.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" -#include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -38,14 +39,17 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { } // namespace -GetContext::GetContext( - const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState init_state, const Slice& user_key, - PinnableSlice* pinnable_val, std::string* timestamp, bool* value_found, - MergeContext* merge_context, bool do_merge, - SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, - bool* is_blob_index, uint64_t tracing_get_id) +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* pinnable_val, + std::string* timestamp, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, + uint64_t tracing_get_id, BlobFetcher* blob_fetcher) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -57,14 +61,15 @@ GetContext::GetContext( value_found_(value_found), merge_context_(merge_context), max_covering_tombstone_seq_(_max_covering_tombstone_seq), - env_(env), + clock_(clock), seq_(seq), replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), do_merge_(do_merge), is_blob_index_(is_blob_index), - tracing_get_id_(tracing_get_id) { + tracing_get_id_(tracing_get_id), + blob_fetcher_(blob_fetcher) { if (seq_) { *seq_ = kMaxSequenceNumber; } @@ -75,13 +80,14 @@ GetContext::GetContext( const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState 
init_state, const Slice& user_key, PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, - bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env, - SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, - ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id) + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, + bool* is_blob_index, uint64_t tracing_get_id, BlobFetcher* blob_fetcher) : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key, pinnable_val, nullptr, value_found, merge_context, do_merge, - _max_covering_tombstone_seq, env, seq, _pinned_iters_mgr, - callback, is_blob_index, tracing_get_id) {} + _max_covering_tombstone_seq, clock, seq, _pinned_iters_mgr, + callback, is_blob_index, tracing_get_id, blob_fetcher) {} // Called from TableCache::Get and Table::Get when file/block in which // key may exist are not there in TableCache/BlockCache respectively. In this @@ -216,7 +222,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); - if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) { + if (ucmp_->EqualWithoutTimestamp(parsed_key.user_key, user_key_)) { *matched = true; // If the value is not in the snapshot, skip it if (!CheckCallback(parsed_key.sequence)) { @@ -245,9 +251,12 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(state_ == kNotFound || state_ == kMerge); if (type == kTypeBlobIndex && is_blob_index_ == nullptr) { // Blob value not supported. Stop. - state_ = kBlobIndex; + state_ = kUnexpectedBlobIndex; return false; } + if (is_blob_index_ != nullptr) { + *is_blob_index_ = (type == kTypeBlobIndex); + } if (kNotFound == state_) { state_ = kFound; if (do_merge_) { @@ -258,7 +267,6 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } else { TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); - // Otherwise copy the value pinnable_val_->PinSelf(value); } @@ -267,27 +275,44 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of // merge_context_->operand_list - push_operand(value, value_pinner); + if (is_blob_index_ != nullptr && *is_blob_index_) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + push_operand(blob_value, nullptr); + } else { + push_operand(value, value_pinner); + } } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); - state_ = kFound; - if (do_merge_) { - if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } + if (is_blob_index_ != nullptr && *is_blob_index_) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + state_ = kFound; + if (do_merge_) { + Merge(&blob_value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + 
push_operand(blob_value, nullptr); } } else { - // It means this function is called as part of DB GetMergeOperands - // API and the current value should be part of - // merge_context_->operand_list - push_operand(value, value_pinner); + state_ = kFound; + if (do_merge_) { + Merge(&value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); + } } } if (state_ == kFound) { @@ -297,9 +322,6 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, timestamp_->assign(ts.data(), ts.size()); } } - if (is_blob_index_ != nullptr) { - *is_blob_index_ = (type == kTypeBlobIndex); - } return false; case kTypeDeletion: @@ -313,20 +335,9 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, state_ = kDeleted; } else if (kMerge == state_) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } - } - // If do_merge_ = false then the current value shouldn't be part of - // merge_context_->operand_list - } + Merge(nullptr); + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list } return false; @@ -339,20 +350,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, merge_operator_->ShouldMerge( merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - // do_merge_ = true this is the case where this function is called - // as part of DB Get API hence merge operators should be merged. 
- if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } - } - } + Merge(nullptr); return false; } return true; @@ -367,6 +365,35 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, return false; } +void GetContext::Merge(const Slice* value) { + if (LIKELY(pinnable_val_ != nullptr)) { + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, value, merge_context_->GetOperands(), + pinnable_val_->GetSelf(), logger_, statistics_, clock_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } + } + } +} + +bool GetContext::GetBlobValue(const Slice& blob_index, + PinnableSlice* blob_value) { + Status status = blob_fetcher_->FetchBlob(user_key_, blob_index, blob_value); + if (!status.ok()) { + if (status.IsIncomplete()) { + MarkKeyMayExist(); + return false; + } + state_ = kCorrupt; + return false; + } + *is_blob_index_ = false; + return true; +} + void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && value_pinner != nullptr) { diff --git a/table/get_context.h b/table/get_context.h index c349a3e6ff9..9b2f678078d 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -5,10 +5,11 @@ #pragma once #include + +#include "db/blob/blob_fetcher.h" #include "db/dbformat.h" #include "db/merge_context.h" #include "db/read_callback.h" -#include "rocksdb/env.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" #include "table/block_based/block.h" @@ -16,6 +17,7 @@ namespace ROCKSDB_NAMESPACE { class MergeContext; class PinnedIteratorsManager; +class SystemClock; // Data structure for accumulating statistics during a point lookup. At the // end of the point lookup, the corresponding ticker stats are updated. This @@ -71,7 +73,7 @@ class GetContext { kDeleted, kCorrupt, kMerge, // saver contains the current merge result (the operands) - kBlobIndex, + kUnexpectedBlobIndex, }; GetContextStats get_context_stats_; @@ -97,23 +99,23 @@ class GetContext { // merge_context and they are never merged. The value pointer is untouched. 
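Taken together, the SaveValue hunks above reduce to one pattern: blob indexes are resolved through the BlobFetcher first, and every full merge funnels through the new Merge() helper (nullptr meaning "no base value"). A condensed view of that flow, simplified from the hunks above:

// Inside SaveValue(), for a value/blob-index entry while in the merge state:
if (is_blob_index_ != nullptr && *is_blob_index_) {
  PinnableSlice pin_val;
  if (!GetBlobValue(value, &pin_val)) {
    return false;  // GetBlobValue already set kCorrupt or MarkKeyMayExist()
  }
  Slice blob_value(pin_val);
  state_ = kFound;
  if (do_merge_) {
    Merge(&blob_value);                 // full merge with the fetched blob
  } else {
    push_operand(blob_value, nullptr);  // GetMergeOperands() path
  }
} else {
  state_ = kFound;
  if (do_merge_) {
    Merge(&value);
  } else {
    push_operand(value, value_pinner);
  }
}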
GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, - const Slice& user_key, PinnableSlice* value, - bool* value_found, MergeContext* merge_context, bool do_merge, - SequenceNumber* max_covering_tombstone_seq, Env* env, + const Slice& user_key, PinnableSlice* value, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* max_covering_tombstone_seq, SystemClock* clock, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, - uint64_t tracing_get_id = 0); + uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr); GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, std::string* timestamp, bool* value_found, MergeContext* merge_context, bool do_merge, - SequenceNumber* max_covering_tombstone_seq, Env* env, + SequenceNumber* max_covering_tombstone_seq, SystemClock* clock, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, - uint64_t tracing_get_id = 0); + uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr); GetContext() = delete; @@ -170,6 +172,9 @@ class GetContext { void push_operand(const Slice& value, Cleanable* value_pinner); private: + void Merge(const Slice* value); + bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value); + const Comparator* ucmp_; const MergeOperator* merge_operator_; // the merge operations encountered; @@ -183,7 +188,7 @@ class GetContext { bool* value_found_; // Is value set correctly? Used by KeyMayExist MergeContext* merge_context_; SequenceNumber* max_covering_tombstone_seq_; - Env* env_; + SystemClock* clock_; // If a key is found, seq_ will be set to the SequenceNumber of most recent // write to the key or kMaxSequenceNumber if unknown SequenceNumber* seq_; @@ -200,6 +205,7 @@ class GetContext { // Used for block cache tracing only. A tracing get id uniquely identifies a // Get or a MultiGet. const uint64_t tracing_get_id_; + BlobFetcher* blob_fetcher_; }; // Call this to replay a log and bring the get_context up to date. 
The replay diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 6b781de1ea9..52e56be81bf 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -83,6 +83,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries); Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); @@ -96,12 +97,23 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { if (props.file_creation_time > 0) { Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time); } + if (props.slow_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize, + props.slow_compression_estimated_data_size); + } + if (props.fast_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kFastCompressionEstimatedDataSize, + props.fast_compression_estimated_data_size); + } if (!props.db_id.empty()) { Add(TablePropertiesNames::kDbId, props.db_id); } if (!props.db_session_id.empty()) { Add(TablePropertiesNames::kDbSessionId, props.db_session_id); } + if (!props.db_host_id.empty()) { + Add(TablePropertiesNames::kDbHostId, props.db_host_id); + } if (!props.filter_policy_name.empty()) { Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name); @@ -141,8 +153,8 @@ Slice PropertyBlockBuilder::Finish() { return properties_block_->Finish(); } -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name) { +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const std::string& name) { assert(method == "Add" || method == "Finish"); std::string msg = @@ -169,11 +181,11 @@ bool NotifyCollectTableCollectorsOnAdd( void NotifyCollectTableCollectorsOnBlockAdd( const std::vector>& collectors, - const uint64_t blockRawBytes, const uint64_t blockCompressedBytesFast, - const uint64_t blockCompressedBytesSlow) { + const uint64_t block_raw_bytes, const uint64_t block_compressed_bytes_fast, + const uint64_t block_compressed_bytes_slow) { for (auto& collector : collectors) { - collector->BlockAdd(blockRawBytes, blockCompressedBytesFast, - blockCompressedBytesSlow); + collector->BlockAdd(block_raw_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); } } @@ -200,7 +212,7 @@ bool NotifyCollectTableCollectorsOnFinish( Status ReadProperties(const ReadOptions& read_options, const Slice& handle_value, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, TableProperties** table_properties, bool verify_checksum, BlockHandle* ret_block_handle, CacheAllocationPtr* verification_buf, @@ -258,6 +270,8 @@ Status ReadProperties(const ReadOptions& read_options, {TablePropertiesNames::kNumDataBlocks, &new_table_properties->num_data_blocks}, {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumFilterEntries, + &new_table_properties->num_filter_entries}, {TablePropertiesNames::kDeletedKeys, &new_table_properties->num_deletions}, {TablePropertiesNames::kMergeOperands, @@ -276,6 +290,10 @@ Status ReadProperties(const ReadOptions& read_options, 
&new_table_properties->oldest_key_time}, {TablePropertiesNames::kFileCreationTime, &new_table_properties->file_creation_time}, + {TablePropertiesNames::kSlowCompressionEstimatedDataSize, + &new_table_properties->slow_compression_estimated_data_size}, + {TablePropertiesNames::kFastCompressionEstimatedDataSize, + &new_table_properties->fast_compression_estimated_data_size}, }; std::string last_key; @@ -314,7 +332,7 @@ Status ReadProperties(const ReadOptions& read_options, auto error_msg = "Detect malformed value in properties meta-block:" "\tkey: " + key + "\tval: " + raw_val.ToString(); - ROCKS_LOG_ERROR(ioptions.info_log, "%s", error_msg.c_str()); + ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); continue; } *(pos->second) = val; @@ -322,6 +340,8 @@ Status ReadProperties(const ReadOptions& read_options, new_table_properties->db_id = raw_val.ToString(); } else if (key == TablePropertiesNames::kDbSessionId) { new_table_properties->db_session_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbHostId) { + new_table_properties->db_host_id = raw_val.ToString(); } else if (key == TablePropertiesNames::kFilterPolicy) { new_table_properties->filter_policy_name = raw_val.ToString(); } else if (key == TablePropertiesNames::kColumnFamilyName) { @@ -366,7 +386,7 @@ Status ReadProperties(const ReadOptions& read_options, Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, TableProperties** properties, bool compression_type_missing, MemoryAllocator* memory_allocator, @@ -437,7 +457,7 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockHandle* block_handle, bool /*compression_type_missing*/, @@ -479,7 +499,7 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool /*compression_type_missing*/, MemoryAllocator* memory_allocator) { diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 319b2c7127e..01b56d57c0c 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -70,8 +70,8 @@ class PropertyBlockBuilder { // Were we encounter any error occurs during user-defined statistics collection, // we'll write the warning message to info log. -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name); +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const std::string& name); // Utility functions help table builder to trigger batch events for user // defined property collectors. @@ -86,8 +86,8 @@ bool NotifyCollectTableCollectorsOnAdd( void NotifyCollectTableCollectorsOnBlockAdd( const std::vector>& collectors, - uint64_t blockRawBytes, uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow); + uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow); // NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all // property collectors. 
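These property-block additions round-trip: PropertyBlockBuilder::AddTableProperty writes them only when non-zero or non-empty, and ReadProperties fills the matching TableProperties fields. A reader-side sketch using the fields introduced above (props is a TableProperties* filled in by ReadTableProperties; zero or empty means the writer did not record the value):

uint64_t filter_entries = props->num_filter_entries;
uint64_t slow_estimate  = props->slow_compression_estimated_data_size;
uint64_t fast_estimate  = props->fast_compression_estimated_data_size;
const std::string& host = props->db_host_id;  // possibly the reified hostname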
The collected properties will be added to `builder`. @@ -102,7 +102,7 @@ bool NotifyCollectTableCollectorsOnFinish( Status ReadProperties(const ReadOptions& ro, const Slice& handle_value, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, TableProperties** table_properties, bool verify_checksum, BlockHandle* block_handle, CacheAllocationPtr* verification_buf, @@ -119,7 +119,7 @@ Status ReadProperties(const ReadOptions& ro, const Slice& handle_value, // `ReadProperties`, `FindMetaBlock`, and `ReadMetaBlock` Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, TableProperties** properties, bool compression_type_missing = false, MemoryAllocator* memory_allocator = nullptr, @@ -133,7 +133,7 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, // Find the meta block Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockHandle* block_handle, bool compression_type_missing = false, @@ -145,7 +145,7 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool compression_type_missing = false, diff --git a/table/mock_table.cc b/table/mock_table.cc index 757fdb963a5..cc3cff97331 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -18,8 +18,8 @@ namespace mock { KVVector MakeMockFile(std::initializer_list l) { return KVVector(l); } -void SortKVVector(KVVector* kv_vector) { - InternalKeyComparator icmp(BytewiseComparator()); +void SortKVVector(KVVector* kv_vector, const Comparator* ucmp) { + InternalKeyComparator icmp(ucmp); std::sort(kv_vector->begin(), kv_vector->end(), [icmp](KVPair a, KVPair b) -> bool { return icmp.Compare(a.first, b.first) < 0; @@ -207,8 +207,10 @@ Status MockTableReader::Get(const ReadOptions&, const Slice& key, std::unique_ptr iter(new MockTableIterator(table_)); for (iter->Seek(key); iter->Valid(); iter->Next()) { ParsedInternalKey parsed_key; - if (ParseInternalKey(iter->key(), &parsed_key) != Status::OK()) { - return Status::Corruption(Slice()); + Status pik_status = + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + return pik_status; } bool dont_care __attribute__((__unused__)); @@ -233,7 +235,11 @@ Status MockTableFactory::NewTableReader( std::unique_ptr&& file, uint64_t /*file_size*/, std::unique_ptr* table_reader, bool /*prefetch_index_and_filter_in_cache*/) const { - uint32_t id = GetIDFromFile(file.get()); + uint32_t id; + Status s = GetIDFromFile(file.get(), &id); + if (!s.ok()) { + return s; + } MutexLock lock_guard(&file_system_.mutex); @@ -249,42 +255,46 @@ Status MockTableFactory::NewTableReader( TableBuilder* MockTableFactory::NewTableBuilder( const TableBuilderOptions& /*table_builder_options*/, - uint32_t /*column_family_id*/, WritableFileWriter* file) const { - uint32_t id = GetAndWriteNextID(file); + WritableFileWriter* file) const { + uint32_t id; + Status s = GetAndWriteNextID(file, &id); + 
assert(s.ok()); return new MockTableBuilder(id, &file_system_, corrupt_mode_); } Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, KVVector file_contents) { - std::unique_ptr file; - auto s = env->NewWritableFile(fname, &file, EnvOptions()); + std::unique_ptr file_writer; + auto s = WritableFileWriter::Create(env->GetFileSystem(), fname, + FileOptions(), &file_writer, nullptr); if (!s.ok()) { return s; } - - WritableFileWriter file_writer(NewLegacyWritableFileWrapper(std::move(file)), - fname, EnvOptions()); - - uint32_t id = GetAndWriteNextID(&file_writer); - file_system_.files.insert({id, std::move(file_contents)}); - return Status::OK(); + uint32_t id; + s = GetAndWriteNextID(file_writer.get(), &id); + if (s.ok()) { + file_system_.files.insert({id, std::move(file_contents)}); + } + return s; } -uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const { - uint32_t next_id = next_id_.fetch_add(1); +Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file, + uint32_t* next_id) const { + *next_id = next_id_.fetch_add(1); char buf[4]; - EncodeFixed32(buf, next_id); - file->Append(Slice(buf, 4)); - return next_id; + EncodeFixed32(buf, *next_id); + return file->Append(Slice(buf, 4)); } -uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { +Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file, + uint32_t* id) const { char buf[4]; Slice result; - file->Read(IOOptions(), 0, 4, &result, buf, nullptr); + Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr); assert(result.size() == 4); - return DecodeFixed32(buf); + *id = DecodeFixed32(buf); + return s; } void MockTableFactory::AssertSingleFile(const KVVector& file_contents) { @@ -303,8 +313,9 @@ void MockTableFactory::AssertLatestFile(const KVVector& file_contents) { ParsedInternalKey ikey; std::string key, value; std::tie(key, value) = kv; - ASSERT_OK(ParseInternalKey(Slice(key), &ikey)); - std::cout << ikey.DebugString(false) << " -> " << value << std::endl; + ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); + std::cout << ikey.DebugString(true, false) << " -> " << value + << std::endl; } FAIL(); } diff --git a/table/mock_table.h b/table/mock_table.h index 0ab9674d6e9..095f6334101 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -31,7 +31,8 @@ using KVPair = std::pair; using KVVector = std::vector; KVVector MakeMockFile(std::initializer_list l = {}); -void SortKVVector(KVVector* kv_vector); +void SortKVVector(KVVector* kv_vector, + const Comparator* ucmp = BytewiseComparator()); struct MockTableFileSystem { port::Mutex mutex; @@ -57,7 +58,7 @@ class MockTableFactory : public TableFactory { bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_familly_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; // This function will directly create mock table instead of going through // MockTableBuilder. 
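GetAndWriteNextID and GetIDFromFile now surface I/O failures instead of returning a bare id, so callers check the Status and receive the id through an out-parameter, as NewTableReader does above:

uint32_t id = 0;
Status s = GetIDFromFile(file.get(), &id);  // previously: id = GetIDFromFile(file.get())
if (!s.ok()) {
  return s;  // propagate the read failure instead of using an undefined id
}
// ... continue with the decoded id ...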
file_contents has to have a format of next_id_; diff --git a/table/multiget_context.h b/table/multiget_context.h index 604a26f8b4e..1c9f8da940b 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -7,6 +7,8 @@ #include #include #include + +#include "db/dbformat.h" #include "db/lookup_key.h" #include "db/merge_context.h" #include "rocksdb/env.h" @@ -21,13 +23,15 @@ class GetContext; struct KeyContext { const Slice* key; LookupKey* lkey; - Slice ukey; + Slice ukey_with_ts; + Slice ukey_without_ts; Slice ikey; ColumnFamilyHandle* column_family; Status* s; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq; bool key_exists; + bool is_blob_index; void* cb_arg; PinnableSlice* value; std::string* timestamp; @@ -41,6 +45,7 @@ struct KeyContext { s(stat), max_covering_tombstone_seq(0), key_exists(false), + is_blob_index(false), cb_arg(nullptr), value(val), timestamp(ts), @@ -110,7 +115,10 @@ class MultiGetContext { sorted_keys_[iter] = (*sorted_keys)[begin + iter]; sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter]) LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp); - sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key(); + sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key(); + sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey( + sorted_keys_[iter]->lkey->user_key(), + read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size()); sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key(); } } diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc index faebcfe2f03..3a1f0a41bcc 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -57,9 +57,8 @@ extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const std::vector>* - int_tbl_prop_collector_factories, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactoryRange& int_tbl_prop_collector_factories, uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, const std::string& column_family_name, @@ -100,6 +99,10 @@ PlainTableBuilder::PlainTableBuilder( properties_.column_family_name = column_family_name; properties_.db_id = db_id; properties_.db_session_id = db_session_id; + properties_.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set"); + } properties_.prefix_extractor_name = moptions_.prefix_extractor != nullptr ? 
moptions_.prefix_extractor->Name() : "nullptr"; @@ -109,9 +112,12 @@ PlainTableBuilder::PlainTableBuilder( properties_.user_collected_properties [PlainTablePropertyNames::kEncodingType] = val; - for (auto& collector_factories : *int_tbl_prop_collector_factories) { + for (auto it = int_tbl_prop_collector_factories.first; + it != int_tbl_prop_collector_factories.second; ++it) { + assert(*it); + table_properties_collectors_.emplace_back( - collector_factories->CreateIntTblPropCollector(column_family_id)); + (*it)->CreateIntTblPropCollector(column_family_id)); } } @@ -128,7 +134,8 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { size_t meta_bytes_buf_size = 0; ParsedInternalKey internal_key; - if (ParseInternalKey(key, &internal_key) != Status::OK()) { + if (!ParseInternalKey(key, &internal_key, false /* log_err_key */) + .ok()) { // TODO assert(false); return; } @@ -188,7 +195,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // notify property collectors NotifyCollectTableCollectorsOnAdd( - key, value, offset_, table_properties_collectors_, ioptions_.info_log); + key, value, offset_, table_properties_collectors_, ioptions_.logger); status_ = io_status_; } @@ -209,13 +216,12 @@ Status PlainTableBuilder::Finish() { if (store_index_in_file_ && (properties_.num_entries > 0)) { assert(properties_.num_entries <= std::numeric_limits::max()); - Status s; BlockHandle bloom_block_handle; if (bloom_bits_per_key_ > 0) { bloom_block_.SetTotalBits( &arena_, static_cast(properties_.num_entries) * bloom_bits_per_key_, - ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log); + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -259,9 +265,8 @@ Status PlainTableBuilder::Finish() { property_block_builder.Add(properties_.user_collected_properties); // -- Add user collected properties - NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - ioptions_.info_log, - &property_block_builder); + NotifyCollectTableCollectorsOnFinish( + table_properties_collectors_, ioptions_.logger, &property_block_builder); // -- Write property block BlockHandle property_block_handle; diff --git a/table/plain/plain_table_builder.h b/table/plain/plain_table_builder.h index 6ab5d59e305..7305cb15349 100644 --- a/table/plain/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -37,9 +37,8 @@ class PlainTableBuilder: public TableBuilder { // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. 
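For illustration, the ParseInternalKey() contract this patch moves to (visible in the PlainTableBuilder::Add hunk just above and in several readers below): the function now returns a Status and takes a log_err_key flag controlling whether the offending key is echoed into the error text. A minimal caller sketch; the helper name CheckKey is hypothetical.

    // Illustrative sketch, not part of the patch.
    #include "db/dbformat.h"  // ParsedInternalKey, ParseInternalKey

    namespace {
    using namespace ROCKSDB_NAMESPACE;

    Status CheckKey(const Slice& internal_key) {
      ParsedInternalKey ikey;
      // false keeps the (possibly binary) key out of the message; callers that
      // want it logged for debugging pass true, as some hunks below do.
      Status pik_status =
          ParseInternalKey(internal_key, &ikey, false /* log_err_key */);
      if (!pik_status.ok()) {
        return pik_status;  // replaces the old `!= Status::OK()` comparison
      }
      // ikey.user_key / ikey.sequence / ikey.type are now safe to read.
      return Status::OK();
    }
    }  // namespace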
PlainTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const std::vector>* - int_tbl_prop_collector_factories, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactoryRange& int_tbl_prop_collector_factories, uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_size, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, @@ -96,7 +95,7 @@ class PlainTableBuilder: public TableBuilder { private: Arena arena_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const MutableCFOptions& moptions_; std::vector> table_properties_collectors_; diff --git a/table/plain/plain_table_factory.cc b/table/plain/plain_table_factory.cc index e0d0e69f68d..dd345492dfe 100644 --- a/table/plain/plain_table_factory.cc +++ b/table/plain/plain_table_factory.cc @@ -52,8 +52,7 @@ static std::unordered_map plain_table_type_info = { PlainTableFactory::PlainTableFactory(const PlainTableOptions& options) : table_options_(options) { - ConfigurableHelper::RegisterOptions(*this, &table_options_, - &plain_table_type_info); + RegisterOptions(&table_options_, &plain_table_type_info); } Status PlainTableFactory::NewTableReader( @@ -71,7 +70,7 @@ Status PlainTableFactory::NewTableReader( } TableBuilder* PlainTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { // Ignore the skip_filters flag. PlainTable format is optimized for small // in-memory dbs. The skip_filters optimization is not useful for plain @@ -79,9 +78,10 @@ TableBuilder* PlainTableFactory::NewTableBuilder( // return new PlainTableBuilder( table_builder_options.ioptions, table_builder_options.moptions, - table_builder_options.int_tbl_prop_collector_factories, column_family_id, - file, table_options_.user_key_len, table_options_.encoding_type, - table_options_.index_sparseness, table_options_.bloom_bits_per_key, + table_builder_options.int_tbl_prop_collector_factories, + table_builder_options.column_family_id, file, table_options_.user_key_len, + table_options_.encoding_type, table_options_.index_sparseness, + table_options_.bloom_bits_per_key, table_builder_options.column_family_name, 6, table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, table_options_.store_index_in_file, table_builder_options.db_id, diff --git a/table/plain/plain_table_factory.h b/table/plain/plain_table_factory.h index 61a1ed935cb..e482403277d 100644 --- a/table/plain/plain_table_factory.h +++ b/table/plain/plain_table_factory.h @@ -156,6 +156,8 @@ class PlainTableFactory : public TableFactory { explicit PlainTableFactory( const PlainTableOptions& _table_options = PlainTableOptions()); + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kPlainTableName(); } const char* Name() const override { return kPlainTableName(); } using TableFactory::NewTableReader; Status NewTableReader(const ReadOptions& ro, @@ -166,7 +168,7 @@ class PlainTableFactory : public TableFactory { TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; std::string GetPrintableOptions() const override; static const char kValueTypeSeqId0 = char(~0); diff --git a/table/plain/plain_table_index.cc b/table/plain/plain_table_index.cc index 
1099dfa6e43..f9f700e6e73 100644 --- a/table/plain/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -98,7 +98,7 @@ Slice PlainTableIndexBuilder::Finish() { BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", + ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. @@ -153,12 +153,12 @@ void PlainTableIndexBuilder::BucketizeIndexes( Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - ROCKS_LOG_DEBUG(ioptions_.info_log, + ROCKS_LOG_DEBUG(ioptions_.logger, "Reserving %" PRIu32 " bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); + total_allocate_size, huge_page_tlb_size_, ioptions_.logger); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = @@ -198,7 +198,7 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - ROCKS_LOG_DEBUG(ioptions_.info_log, + ROCKS_LOG_DEBUG(ioptions_.logger, "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); diff --git a/table/plain/plain_table_index.h b/table/plain/plain_table_index.h index 1202a2f566f..3ef0705d484 100644 --- a/table/plain/plain_table_index.h +++ b/table/plain/plain_table_index.h @@ -131,7 +131,7 @@ class PlainTableIndex { // The class is used by PlainTableBuilder class. class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, size_t index_sparseness, double hash_table_ratio, size_t huge_page_tlb_size) @@ -222,7 +222,7 @@ class PlainTableIndexBuilder { const std::vector& entries_per_bucket); Arena* arena_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index 39feb8dd050..e3a76f89ea2 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -85,8 +85,10 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, uint64_t* offset, char* meta_bytes_buf, size_t* meta_bytes_buf_size) { ParsedInternalKey parsed_key; - if (ParseInternalKey(key, &parsed_key) != Status::OK()) { - return IOStatus::Corruption(Slice()); + Status pik_status = + ParseInternalKey(key, &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + return IOStatus::Corruption(pik_status.getState()); } Slice key_to_write = key; // Portion of internal key to write out. @@ -279,9 +281,12 @@ Status PlainTableKeyDecoder::ReadInternalKey( return file_reader_.status(); } *internal_key_valid = true; - if (ParseInternalKey(*internal_key, parsed_key) != Status::OK()) { + Status pik_status = ParseInternalKey(*internal_key, parsed_key, + false /* log_err_key */); // TODO + if (!pik_status.ok()) { return Status::Corruption( - Slice("Incorrect value type found when reading the next key")); + Slice("Corrupted key found during next key read. 
"), + pik_status.getState()); } *bytes_read += user_key_size + 8; } diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index e08174948c9..fbd62426862 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -93,7 +93,7 @@ class PlainTableIterator : public InternalIterator { extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, std::unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, @@ -118,7 +118,7 @@ PlainTableReader::~PlainTableReader() { } Status PlainTableReader::Open( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, const int bloom_bits_per_key, @@ -277,7 +277,7 @@ void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys, if (bloom_total_bits > 0) { enable_bloom_ = true; bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); + huge_page_tlb_size, ioptions_.logger); } } @@ -457,7 +457,8 @@ Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, uint32_t high = upper_bound; ParsedInternalKey mid_key; ParsedInternalKey parsed_target; - Status s = ParseInternalKey(target, &parsed_target); + Status s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO if (!s.ok()) return s; // The key is between [low, high). Do a binary search between it. @@ -593,8 +594,9 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, } ParsedInternalKey found_key; ParsedInternalKey parsed_target; - s = ParseInternalKey(target, &parsed_target); - if (!s.ok()) return Status::Corruption(Slice()); + s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO + if (!s.ok()) return s; Slice found_value; while (offset < file_info_.data_end_offset) { diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index e3b12a9c32b..98bfa59df60 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -67,7 +67,7 @@ class PlainTableReader: public TableReader { // whether it points to the data offset of the first key with the key prefix // or the offset of it. If there are too many keys share this prefix, it will // create a binary search-able index from the suffix to offset on disk. 
- static Status Open(const ImmutableCFOptions& ioptions, + static Status Open(const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, @@ -110,7 +110,7 @@ class PlainTableReader: public TableReader { return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const ImmutableCFOptions& ioptions, + PlainTableReader(const ImmutableOptions& ioptions, std::unique_ptr&& file, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, @@ -163,7 +163,7 @@ class PlainTableReader: public TableReader { CacheAllocationPtr index_block_alloc_; CacheAllocationPtr bloom_block_alloc_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; std::unique_ptr dummy_cleanable_; uint64_t file_size_; protected: // for testing diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index cff9dd970ef..d2ade5bfeaf 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -18,7 +18,6 @@ #include "db/blob/blob_index.h" #include "db/memtable.h" #include "db/write_batch_internal.h" -#include "env/composite_env_wrapper.h" #include "options/cf_options.h" #include "port/port.h" #include "rocksdb/db.h" @@ -80,11 +79,13 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { // read table magic number Footer footer; - std::unique_ptr file; + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file; uint64_t file_size = 0; - Status s = options_.env->NewRandomAccessFile(file_path, &file, soptions_); + Status s = fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file, + nullptr); if (s.ok()) { - s = options_.env->GetFileSize(file_path, &file_size); + s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); } // check empty file @@ -93,8 +94,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { return Status::Aborted(file_path, "Empty file"); } - file_.reset(new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - file_path)); + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); FilePrefetchBuffer prefetch_buffer(nullptr, 0, 0, true /* enable */, false /* track_min_offset */); @@ -119,9 +119,10 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { if (magic_number == kPlainTableMagicNumber || magic_number == kLegacyPlainTableMagicNumber) { soptions_.use_mmap_reads = true; - options_.env->NewRandomAccessFile(file_path, &file, soptions_); - file_.reset(new RandomAccessFileReader( - NewLegacyRandomAccessFileWrapper(file), file_path)); + + fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file, + nullptr); + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); } options_.comparator = &internal_comparator_; // For old sst format, ReadTableProperties might fail but file can be read @@ -144,7 +145,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { } Status SstFileDumper::NewTableReader( - const ImmutableCFOptions& /*ioptions*/, const EnvOptions& /*soptions*/, + const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/, const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, std::unique_ptr* /*table_reader*/) { auto t_opt = @@ -177,8 +178,10 @@ Status SstFileDumper::VerifyChecksum() { Status SstFileDumper::DumpTable(const std::string& out_filename) { std::unique_ptr out_file; Env* env = options_.env; - env->NewWritableFile(out_filename, &out_file, soptions_); - Status s = 
table_reader_->DumpTable(out_file.get()); + Status s = env->NewWritableFile(out_filename, &out_file, soptions_); + if (s.ok()) { + s = table_reader_->DumpTable(out_file.get()); + } if (!s.ok()) { // close the file before return error, ignore the close error if there's any out_file->Close().PermitUncheckedError(); @@ -190,23 +193,20 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) { Status SstFileDumper::CalculateCompressedTableSize( const TableBuilderOptions& tb_options, size_t block_size, uint64_t* num_data_blocks, uint64_t* compressed_table_size) { - std::unique_ptr out_file; std::unique_ptr env(NewMemEnv(options_.env)); - Status s = env->NewWritableFile(testFileName, &out_file, soptions_); + std::unique_ptr dest_writer; + Status s = + WritableFileWriter::Create(env->GetFileSystem(), testFileName, + FileOptions(soptions_), &dest_writer, nullptr); if (!s.ok()) { return s; } - std::unique_ptr dest_writer; - dest_writer.reset( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(out_file)), - testFileName, soptions_)); BlockBasedTableOptions table_options; table_options.block_size = block_size; BlockBasedTableFactory block_based_tf(table_options); std::unique_ptr table_builder; table_builder.reset(block_based_tf.NewTableBuilder( tb_options, - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, dest_writer.get())); std::unique_ptr iter(table_reader_->NewIterator( read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, @@ -233,7 +233,8 @@ Status SstFileDumper::ShowAllCompressionSizes( const std::vector>& compression_types, int32_t compress_level_from, int32_t compress_level_to, - uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes) { + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes) { fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size); for (auto& i : compression_types) { if (CompressionTypeSupported(i.first)) { @@ -241,6 +242,7 @@ Status SstFileDumper::ShowAllCompressionSizes( CompressionOptions compress_opt; compress_opt.max_dict_bytes = max_dict_bytes; compress_opt.zstd_max_train_bytes = zstd_max_train_bytes; + compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes; for (int32_t j = compress_level_from; j <= compress_level_to; j++) { fprintf(stdout, "Compression level: %d", j); compress_opt.level = j; @@ -262,18 +264,18 @@ Status SstFileDumper::ShowCompressionSize( Options opts; opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); opts.statistics->set_stats_level(StatsLevel::kAll); - const ImmutableCFOptions imoptions(opts); + const ImmutableOptions imoptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); - std::vector> - block_based_table_factories; + IntTblPropCollectorFactories block_based_table_factories; std::string column_family_name; int unknown_level = -1; TableBuilderOptions tb_opts( imoptions, moptions, ikc, &block_based_table_factories, compress_type, - 0 /* sample_for_compression */, compress_opt, false /* skip_filters */, + compress_opt, + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, column_family_name, unknown_level); uint64_t num_data_blocks = 0; std::chrono::steady_clock::time_point start = @@ -441,9 +443,9 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, if (read_num > 0 && i > read_num) break; ParsedInternalKey ikey; - if (ParseInternalKey(key, &ikey) != Status::OK()) { - std::cerr << "Internal Key [" << 
key.ToString(true /* in hex*/) - << "] parse error!\n"; + Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */); + if (!pik_status.ok()) { + std::cerr << pik_status.getState() << "\n"; continue; } @@ -459,7 +461,8 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, if (print_kv) { if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) { - fprintf(stdout, "%s => %s\n", ikey.DebugString(output_hex_).c_str(), + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), value.ToString(output_hex_).c_str()); } else { BlobIndex blob_index; @@ -467,11 +470,12 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, const Status s = blob_index.DecodeFrom(value); if (!s.ok()) { fprintf(stderr, "%s => error decoding blob index\n", - ikey.DebugString(output_hex_).c_str()); + ikey.DebugString(true, output_hex_).c_str()); continue; } - fprintf(stdout, "%s => %s\n", ikey.DebugString(output_hex_).c_str(), + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), blob_index.DebugString(output_hex_).c_str()); } } diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h index 9153f8a3800..32aa7be9474 100644 --- a/table/sst_file_dumper.h +++ b/table/sst_file_dumper.h @@ -40,7 +40,8 @@ class SstFileDumper { const std::vector>& compression_types, int32_t compress_level_from, int32_t compress_level_to, - uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes); + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes); Status ShowCompressionSize(size_t block_size, CompressionType compress_type, const CompressionOptions& compress_opt); @@ -62,7 +63,7 @@ class SstFileDumper { // Helper function to call the factory with settings specific to the // factory implementation - Status NewTableReader(const ImmutableCFOptions& ioptions, + Status NewTableReader(const ImmutableOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, uint64_t file_size, @@ -84,7 +85,7 @@ class SstFileDumper { std::unique_ptr table_reader_; std::unique_ptr file_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; const MutableCFOptions moptions_; ReadOptions read_options_; InternalKeyComparator internal_comparator_; diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index f7f22b06110..e106bca9d65 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -10,9 +10,10 @@ #include "db/arena_wrapped_db_iter.h" #include "db/db_iter.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/random_access_file_reader.h" #include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "table/get_context.h" #include "table/table_builder.h" #include "table/table_reader.h" @@ -22,7 +23,7 @@ namespace ROCKSDB_NAMESPACE { struct SstFileReader::Rep { Options options; EnvOptions soptions; - ImmutableCFOptions ioptions; + ImmutableOptions ioptions; MutableCFOptions moptions; std::unique_ptr table_reader; @@ -42,15 +43,17 @@ Status SstFileReader::Open(const std::string& file_path) { auto r = rep_.get(); Status s; uint64_t file_size = 0; - std::unique_ptr file; + std::unique_ptr file; std::unique_ptr file_reader; - s = r->options.env->GetFileSize(file_path, &file_size); + FileOptions fopts(r->soptions); + const auto& fs = r->options.env->GetFileSystem(); + + s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr); if (s.ok()) { - s = 
r->options.env->NewRandomAccessFile(file_path, &file, r->soptions); + s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); } if (s.ok()) { - file_reader.reset(new RandomAccessFileReader( - NewLegacyRandomAccessFileWrapper(file), file_path)); + file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); } if (s.ok()) { TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor.get(), @@ -69,11 +72,12 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { ? roptions.snapshot->GetSequenceNumber() : kMaxSequenceNumber; ArenaWrappedDBIter* res = new ArenaWrappedDBIter(); - res->Init(r->options.env, roptions, r->ioptions, r->moptions, sequence, + res->Init(r->options.env, roptions, r->ioptions, r->moptions, + nullptr /* version */, sequence, r->moptions.max_sequential_skip_in_iterations, 0 /* version_number */, nullptr /* read_callback */, - nullptr /* db_impl */, nullptr /* cfd */, false /* allow_blob */, - false /* allow_refresh */); + nullptr /* db_impl */, nullptr /* cfd */, + true /* expose_blob_index */, false /* allow_refresh */); auto internal_iter = r->table_reader->NewIterator( res->GetReadOptions(), r->moptions.prefix_extractor.get(), res->GetArena(), false /* skip_filters */, diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 8a63b69bdd4..52cab2ab36e 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -5,11 +5,13 @@ #ifndef ROCKSDB_LITE +#include "rocksdb/sst_file_reader.h" + #include #include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" -#include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "table/sst_file_writer_collectors.h" #include "test_util/testharness.h" @@ -37,14 +39,8 @@ class SstFileReaderTest : public testing::Test { sst_name_ = test::PerThreadDBPath("sst_file"); Env* base_env = Env::Default(); - const char* test_env_uri = getenv("TEST_ENV_URI"); - if(test_env_uri) { - Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); env_ = base_env; options_.env = env_; @@ -90,6 +86,9 @@ class SstFileReaderTest : public testing::Test { if (check_global_seqno) { auto properties = reader.GetTableProperties(); ASSERT_TRUE(properties); + std::string hostname; + ASSERT_OK(env_->GetHostNameString(&hostname)); + ASSERT_EQ(properties->db_host_id, hostname); auto& user_properties = properties->user_collected_properties; ASSERT_TRUE( user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno)); diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index f6583beac80..9943296e991 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -8,8 +8,8 @@ #include #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/writable_file_writer.h" +#include "rocksdb/file_system.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" @@ -46,7 +46,7 @@ struct SstFileWriter::Rep { std::unique_ptr file_writer; std::unique_ptr builder; EnvOptions env_options; - ImmutableCFOptions ioptions; + ImmutableOptions ioptions; MutableCFOptions mutable_cf_options; Env::IOPriority io_priority; InternalKeyComparator internal_comparator; @@ -104,7 +104,8 @@ struct SstFileWriter::Rep { 
file_info.largest_key.assign(user_key.data(), user_key.size()); file_info.file_size = builder->FileSize(); - return InvalidatePageCache(false /* closing */); + InvalidatePageCache(false /* closing */).PermitUncheckedError(); + return Status::OK(); } Status DeleteRange(const Slice& begin_key, const Slice& end_key) { @@ -138,7 +139,8 @@ struct SstFileWriter::Rep { file_info.num_range_del_entries++; file_info.file_size = builder->FileSize(); - return InvalidatePageCache(false /* closing */); + InvalidatePageCache(false /* closing */).PermitUncheckedError(); + return Status::OK(); } Status InvalidatePageCache(bool closing) { @@ -189,8 +191,10 @@ SstFileWriter::~SstFileWriter() { Status SstFileWriter::Open(const std::string& file_path) { Rep* r = rep_.get(); Status s; - std::unique_ptr sst_file; - s = r->ioptions.env->NewWritableFile(file_path, &sst_file, r->env_options); + std::unique_ptr sst_file; + FileOptions cur_file_opts(r->env_options); + s = r->ioptions.env->GetFileSystem()->NewWritableFile( + file_path, cur_file_opts, &sst_file, nullptr); if (!s.ok()) { return s; } @@ -215,11 +219,8 @@ Status SstFileWriter::Open(const std::string& file_path) { compression_type = r->mutable_cf_options.compression; compression_opts = r->mutable_cf_options.compression_opts; } - uint64_t sample_for_compression = - r->mutable_cf_options.sample_for_compression; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; // SstFileWriter properties collector to add SstFileWriter version. int_tbl_prop_collector_factories.emplace_back( @@ -257,21 +258,25 @@ Status SstFileWriter::Open(const std::string& file_path) { } TableBuilderOptions table_builder_options( r->ioptions, r->mutable_cf_options, r->internal_comparator, - &int_tbl_prop_collector_factories, compression_type, - sample_for_compression, compression_opts, r->skip_filters, - r->column_family_name, unknown_level, 0 /* creation_time */, - 0 /* oldest_key_time */, 0 /* target_file_size */, - 0 /* file_creation_time */, "SST Writer" /* db_id */, db_session_id); + &int_tbl_prop_collector_factories, compression_type, compression_opts, + cf_id, r->column_family_name, unknown_level, false /* is_bottommost */, + TableFileCreationReason::kMisc, 0 /* creation_time */, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "SST Writer" /* db_id */, db_session_id, 0 /* target_file_size */, 0); + // XXX: when we can remove skip_filters from the SstFileWriter public API + // we can remove it from TableBuilderOptions. + table_builder_options.skip_filters = r->skip_filters; + FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; r->file_writer.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(sst_file)), file_path, - r->env_options, r->ioptions.env, nullptr /* io_tracer */, - nullptr /* stats */, r->ioptions.listeners, - r->ioptions.file_checksum_gen_factory)); + std::move(sst_file), file_path, r->env_options, r->ioptions.clock, + nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners, + r->ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile))); // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. 
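For illustration, the FileSystem-based open path that SstFileWriter::Open above (and SstFileReader/SstFileDumper earlier in this patch) now uses: the file handle comes from Env::GetFileSystem() with FileOptions, and WritableFileWriter takes the FSWritableFile directly, with no NewLegacyWritableFileWrapper. The helper name OpenSstForWrite is hypothetical; the extra writer-constructor arguments (clock, listeners, checksum factory) are left at their defaults here.

    // Illustrative sketch, not part of the patch.
    #include <memory>
    #include <string>

    #include "file/writable_file_writer.h"
    #include "rocksdb/env.h"
    #include "rocksdb/file_system.h"

    namespace {
    using namespace ROCKSDB_NAMESPACE;

    Status OpenSstForWrite(Env* env, const std::string& file_path,
                           const EnvOptions& env_options,
                           std::unique_ptr<WritableFileWriter>* writer) {
      const auto& fs = env->GetFileSystem();
      FileOptions file_opts(env_options);  // FileOptions wraps the old EnvOptions
      std::unique_ptr<FSWritableFile> sst_file;
      Status s = fs->NewWritableFile(file_path, file_opts, &sst_file, nullptr);
      if (!s.ok()) {
        return s;
      }
      writer->reset(
          new WritableFileWriter(std::move(sst_file), file_path, file_opts));
      return Status::OK();
    }
    }  // namespace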
r->builder.reset(r->ioptions.table_factory->NewTableBuilder( - table_builder_options, cf_id, r->file_writer.get())); + table_builder_options, r->file_writer.get())); r->file_info = ExternalSstFileInfo(); r->file_info.file_path = file_path; @@ -318,9 +323,7 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { s = r->file_writer->Sync(r->ioptions.use_fsync); } if (s.ok()) { - s = r->InvalidatePageCache(true /* closing */); - } - if (s.ok()) { + r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); s = r->file_writer->Close(); } } diff --git a/table/sst_file_writer_collectors.h b/table/sst_file_writer_collectors.h index 01ecec97176..2dbd611ab6f 100644 --- a/table/sst_file_writer_collectors.h +++ b/table/sst_file_writer_collectors.h @@ -35,9 +35,9 @@ class SstFileWriterPropertiesCollector : public IntTblPropCollector { return Status::OK(); } - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks. return; diff --git a/table/table_builder.h b/table/table_builder.h index 36475c14376..f22b10750af 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -10,9 +10,11 @@ #pragma once #include + #include #include #include + #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "file/writable_file_writer.h" @@ -28,22 +30,25 @@ class Status; struct TableReaderOptions { // @param skip_filters Disables loading/accessing the filter block - TableReaderOptions(const ImmutableCFOptions& _ioptions, + TableReaderOptions(const ImmutableOptions& _ioptions, const SliceTransform* _prefix_extractor, const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters = false, bool _immortal = false, bool _force_direct_prefetch = false, int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr, - size_t _max_file_size_for_l0_meta_pin = 0) - : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, - _internal_comparator, _skip_filters, _immortal, - _force_direct_prefetch, _level, - 0 /* _largest_seqno */, _block_cache_tracer, - _max_file_size_for_l0_meta_pin) {} + size_t _max_file_size_for_l0_meta_pin = 0, + const std::string& _cur_db_session_id = "", + uint64_t _cur_file_num = 0) + : TableReaderOptions( + _ioptions, _prefix_extractor, _env_options, _internal_comparator, + _skip_filters, _immortal, _force_direct_prefetch, _level, + 0 /* _largest_seqno */, _block_cache_tracer, + _max_file_size_for_l0_meta_pin, _cur_db_session_id, _cur_file_num) { + } // @param skip_filters Disables loading/accessing the filter block - TableReaderOptions(const ImmutableCFOptions& _ioptions, + TableReaderOptions(const ImmutableOptions& _ioptions, const SliceTransform* _prefix_extractor, const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, @@ -51,7 +56,9 @@ struct TableReaderOptions { bool _force_direct_prefetch, int _level, SequenceNumber _largest_seqno, BlockCacheTracer* const _block_cache_tracer, - size_t _max_file_size_for_l0_meta_pin) + size_t _max_file_size_for_l0_meta_pin, + const std::string& _cur_db_session_id, + uint64_t _cur_file_num) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), @@ -62,9 +69,11 @@ struct TableReaderOptions 
{ level(_level), largest_seqno(_largest_seqno), block_cache_tracer(_block_cache_tracer), - max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin) {} + max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin), + cur_db_session_id(_cur_db_session_id), + cur_file_num(_cur_file_num) {} - const ImmutableCFOptions& ioptions; + const ImmutableOptions& ioptions; const SliceTransform* prefix_extractor; const EnvOptions& env_options; const InternalKeyComparator& internal_comparator; @@ -76,7 +85,8 @@ struct TableReaderOptions { // fetch into RocksDB's buffer, rather than relying // RandomAccessFile::Prefetch(). bool force_direct_prefetch; - // what level this table/file is on, -1 for "not set, don't know" + // What level this table/file is on, -1 for "not set, don't know." Used + // for level-specific statistics. int level; // largest seqno in the table SequenceNumber largest_seqno; @@ -84,55 +94,93 @@ struct TableReaderOptions { // Largest L0 file size whose meta-blocks may be pinned (can be zero when // unknown). const size_t max_file_size_for_l0_meta_pin; + + std::string cur_db_session_id; + + uint64_t cur_file_num; }; struct TableBuilderOptions { TableBuilderOptions( - const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, + const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, const InternalKeyComparator& _internal_comparator, - const std::vector>* - _int_tbl_prop_collector_factories, - CompressionType _compression_type, uint64_t _sample_for_compression, - const CompressionOptions& _compression_opts, bool _skip_filters, + const IntTblPropCollectorFactoryRange& _int_tbl_prop_collector_factories, + CompressionType _compression_type, + const CompressionOptions& _compression_opts, uint32_t _column_family_id, const std::string& _column_family_name, int _level, + bool _is_bottommost = false, + TableFileCreationReason _reason = TableFileCreationReason::kMisc, const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, - const uint64_t _target_file_size = 0, const uint64_t _file_creation_time = 0, const std::string& _db_id = "", - const std::string& _db_session_id = "") + const std::string& _db_session_id = "", + const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) : ioptions(_ioptions), moptions(_moptions), internal_comparator(_internal_comparator), int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), compression_type(_compression_type), - sample_for_compression(_sample_for_compression), compression_opts(_compression_opts), - skip_filters(_skip_filters), + column_family_id(_column_family_id), column_family_name(_column_family_name), - level(_level), creation_time(_creation_time), oldest_key_time(_oldest_key_time), target_file_size(_target_file_size), file_creation_time(_file_creation_time), db_id(_db_id), - db_session_id(_db_session_id) {} + db_session_id(_db_session_id), + level_at_creation(_level), + is_bottommost(_is_bottommost), + reason(_reason), + cur_file_num(_cur_file_num) {} - const ImmutableCFOptions& ioptions; + TableBuilderOptions( + const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, + const InternalKeyComparator& _internal_comparator, + const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories, + CompressionType _compression_type, + const CompressionOptions& _compression_opts, uint32_t _column_family_id, + const std::string& _column_family_name, int _level, + bool _is_bottommost = false, + TableFileCreationReason _reason = TableFileCreationReason::kMisc, + const 
uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, + const uint64_t _file_creation_time = 0, const std::string& _db_id = "", + const std::string& _db_session_id = "", + const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) + : TableBuilderOptions(_ioptions, _moptions, _internal_comparator, + IntTblPropCollectorFactoryRange( + _int_tbl_prop_collector_factories->begin(), + _int_tbl_prop_collector_factories->end()), + _compression_type, _compression_opts, + _column_family_id, _column_family_name, _level, + _is_bottommost, _reason, _creation_time, + _oldest_key_time, _file_creation_time, _db_id, + _db_session_id, _target_file_size, _cur_file_num) {} + + const ImmutableOptions& ioptions; const MutableCFOptions& moptions; const InternalKeyComparator& internal_comparator; - const std::vector>* - int_tbl_prop_collector_factories; - CompressionType compression_type; - uint64_t sample_for_compression; + const IntTblPropCollectorFactoryRange int_tbl_prop_collector_factories; + const CompressionType compression_type; const CompressionOptions& compression_opts; - bool skip_filters; // only used by BlockBasedTableBuilder + const uint32_t column_family_id; const std::string& column_family_name; - int level; // what level this table/file is on, -1 for "not set, don't know" const uint64_t creation_time; const int64_t oldest_key_time; const uint64_t target_file_size; const uint64_t file_creation_time; const std::string db_id; const std::string db_session_id; + // BEGIN for FilterBuildingContext + const int level_at_creation; + const bool is_bottommost; + const TableFileCreationReason reason; + // END for FilterBuildingContext + + // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you + // want to skip filters, that should be (for example) null filter_policy + // in the table options of the ioptions.table_factory + bool skip_filters = false; + const uint64_t cur_file_num; }; // TableBuilder provides the interface used to build a Table diff --git a/table/table_factory.cc b/table/table_factory.cc index 18935c8591d..962bad9badd 100644 --- a/table/table_factory.cc +++ b/table/table_factory.cc @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
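For illustration, the calling convention implied by the reworked TableBuilderOptions above: the column family id and name travel inside the options, sample_for_compression and skip_filters are gone from the constructor (skip_filters is now a plain member), and NewTableBuilder() takes only the options plus the file writer. This condensed sketch mirrors the sst_file_dumper hunk earlier in the patch; MakeBuilder and the literal values are hypothetical.

    // Illustrative sketch, not part of the patch.
    #include <memory>
    #include <string>

    #include "options/cf_options.h"   // ImmutableOptions, MutableCFOptions
    #include "rocksdb/table.h"        // TableFactory
    #include "table/table_builder.h"  // TableBuilderOptions, TableBuilder

    namespace {
    using namespace ROCKSDB_NAMESPACE;

    std::unique_ptr<TableBuilder> MakeBuilder(
        const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
        const InternalKeyComparator& ikc,
        const IntTblPropCollectorFactories* collector_factories,
        const TableFactory& factory, WritableFileWriter* file) {
      const std::string cf_name = "default";  // options keep a reference to it
      TableBuilderOptions tb_opts(
          ioptions, moptions, ikc, collector_factories,
          CompressionType::kNoCompression, CompressionOptions(),
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
          cf_name, -1 /* unknown level */);
      tb_opts.skip_filters = false;  // now a member, not a constructor argument
      // No separate column_family_id argument any more.
      return std::unique_ptr<TableBuilder>(
          factory.NewTableBuilder(tb_opts, file));
    }
    }  // namespace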
+#include "options/customizable_helper.h" #include "rocksdb/convenience.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" @@ -11,23 +12,9 @@ namespace ROCKSDB_NAMESPACE { -Status TableFactory::CreateFromString(const ConfigOptions& config_options_in, - const std::string& id, - std::shared_ptr* factory) { - Status status; - std::string name = id; - - std::string existing_opts; - - ConfigOptions config_options = config_options_in; - if (factory->get() != nullptr && name == factory->get()->Name()) { - config_options.delimiter = ";"; - - status = factory->get()->GetOptionString(config_options, &existing_opts); - if (!status.ok()) { - return status; - } - } +static bool LoadFactory(const std::string& name, + std::shared_ptr* factory) { + bool success = true; if (name == TableFactory::kBlockBasedTableName()) { factory->reset(new BlockBasedTableFactory()); #ifndef ROCKSDB_LITE @@ -37,14 +24,15 @@ Status TableFactory::CreateFromString(const ConfigOptions& config_options_in, factory->reset(new CuckooTableFactory()); #endif // ROCKSDB_LITE } else { - status = Status::NotSupported("Could not load table factory: ", name); - return status; - } - if (!existing_opts.empty()) { - config_options.invoke_prepare_options = false; - status = factory->get()->ConfigureFromString(config_options, existing_opts); + success = false; } - return status; + return success; } +Status TableFactory::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* factory) { + return LoadSharedObject(config_options, value, LoadFactory, + factory); +} } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_properties.cc b/table/table_properties.cc index 622f3d45b17..373c763d6a6 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -111,6 +111,8 @@ std::string TableProperties::ToString( } AppendProperty(result, "filter block size", filter_size, prop_delim, kv_delim); + AppendProperty(result, "# entries for filter", num_filter_entries, prop_delim, + kv_delim); AppendProperty(result, "(estimated) table size", data_size + index_size + filter_size, prop_delim, kv_delim); @@ -168,6 +170,11 @@ std::string TableProperties::ToString( AppendProperty(result, "file creation time", file_creation_time, prop_delim, kv_delim); + AppendProperty(result, "slow compression estimated data size", + slow_compression_estimated_data_size, prop_delim, kv_delim); + AppendProperty(result, "fast compression estimated data size", + fast_compression_estimated_data_size, prop_delim, kv_delim); + // DB identity and DB session ID AppendProperty(result, "DB identity", db_id, prop_delim, kv_delim); AppendProperty(result, "DB session identity", db_session_id, prop_delim, @@ -188,14 +195,44 @@ void TableProperties::Add(const TableProperties& tp) { raw_value_size += tp.raw_value_size; num_data_blocks += tp.num_data_blocks; num_entries += tp.num_entries; + num_filter_entries += tp.num_filter_entries; num_deletions += tp.num_deletions; num_merge_operands += tp.num_merge_operands; num_range_deletions += tp.num_range_deletions; + slow_compression_estimated_data_size += + tp.slow_compression_estimated_data_size; + fast_compression_estimated_data_size += + tp.fast_compression_estimated_data_size; +} + +std::map +TableProperties::GetAggregatablePropertiesAsMap() const { + std::map rv; + rv["data_size"] = data_size; + rv["index_size"] = index_size; + rv["index_partitions"] = index_partitions; + rv["top_level_index_size"] = top_level_index_size; + rv["filter_size"] = filter_size; 
+ rv["raw_key_size"] = raw_key_size; + rv["raw_value_size"] = raw_value_size; + rv["num_data_blocks"] = num_data_blocks; + rv["num_entries"] = num_entries; + rv["num_filter_entries"] = num_filter_entries; + rv["num_deletions"] = num_deletions; + rv["num_merge_operands"] = num_merge_operands; + rv["num_range_deletions"] = num_range_deletions; + rv["slow_compression_estimated_data_size"] = + slow_compression_estimated_data_size; + rv["fast_compression_estimated_data_size"] = + fast_compression_estimated_data_size; + return rv; } const std::string TablePropertiesNames::kDbId = "rocksdb.creating.db.identity"; const std::string TablePropertiesNames::kDbSessionId = "rocksdb.creating.session.identity"; +const std::string TablePropertiesNames::kDbHostId = + "rocksdb.creating.host.identity"; const std::string TablePropertiesNames::kDataSize = "rocksdb.data.size"; const std::string TablePropertiesNames::kIndexSize = @@ -218,6 +255,8 @@ const std::string TablePropertiesNames::kNumDataBlocks = "rocksdb.num.data.blocks"; const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumFilterEntries = + "rocksdb.num.filter_entries"; const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; const std::string TablePropertiesNames::kMergeOperands = "rocksdb.merge.operands"; @@ -248,6 +287,10 @@ const std::string TablePropertiesNames::kOldestKeyTime = "rocksdb.oldest.key.time"; const std::string TablePropertiesNames::kFileCreationTime = "rocksdb.file.creation.time"; +const std::string TablePropertiesNames::kSlowCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.slow.data.size"; +const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.fast.data.size"; extern const std::string kPropertiesBlock = "rocksdb.properties"; // Old property block name for backward compatibility diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index f1fd605aa49..df4a750d793 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -13,11 +13,12 @@ int main() { #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/random_access_file_reader.h" #include "monitoring/histogram.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" @@ -50,8 +51,8 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } -uint64_t Now(Env* env, bool measured_by_nanosecond) { - return measured_by_nanosecond ? env->NowNanos() : env->NowMicros(); +uint64_t Now(SystemClock* clock, bool measured_by_nanosecond) { + return measured_by_nanosecond ? 
clock->NowNanos() : clock->NowMicros(); } } // namespace @@ -81,30 +82,28 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db"); WriteOptions wo; Env* env = Env::Default(); + auto* clock = env->GetSystemClock().get(); TableBuilder* tb = nullptr; DB* db = nullptr; Status s; - const ImmutableCFOptions ioptions(opts); + const ImmutableOptions ioptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); std::unique_ptr file_writer; if (!through_db) { - std::unique_ptr file; - env->NewWritableFile(file_name, &file, env_options); + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), file_name, + FileOptions(env_options), &file_writer, + nullptr)); - std::vector > - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; - file_writer.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), file_name, env_options)); int unknown_level = -1; tb = opts.table_factory->NewTableBuilder( TableBuilderOptions( ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, 0 /* sample_for_compression */, - CompressionOptions(), false /* skip_filters */, - kDefaultColumnFamilyName, unknown_level), - 0 /* column_family_id */, file_writer.get()); + CompressionType::kNoCompression, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level), + file_writer.get()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -130,17 +129,19 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::unique_ptr table_reader; if (!through_db) { - std::unique_ptr raf; - s = env->NewRandomAccessFile(file_name, &raf, env_options); + const auto& fs = env->GetFileSystem(); + FileOptions fopts(env_options); + + std::unique_ptr raf; + s = fs->NewRandomAccessFile(file_name, fopts, &raf, nullptr); if (!s.ok()) { fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str()); exit(1); } uint64_t file_size; - env->GetFileSize(file_name, &file_size); + fs->GetFileSize(file_name, fopts.io_options, &file_size, nullptr); std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(raf), - file_name)); + new RandomAccessFileReader(std::move(raf), file_name)); s = opts.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), env_options, ikc), @@ -168,21 +169,21 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, if (!for_iterator) { // Query one existing key; std::string key = MakeKey(r1, r2, through_db); - uint64_t start_time = Now(env, measured_by_nanosecond); + uint64_t start_time = Now(clock, measured_by_nanosecond); if (!through_db) { PinnableSlice value; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; - GetContext get_context(ioptions.user_comparator, - ioptions.merge_operator, ioptions.info_log, - ioptions.statistics, GetContext::kNotFound, - Slice(key), &value, nullptr, &merge_context, - true, &max_covering_tombstone_seq, env); + GetContext get_context( + ioptions.user_comparator, ioptions.merge_operator.get(), + ioptions.logger, ioptions.stats, GetContext::kNotFound, + Slice(key), &value, nullptr, &merge_context, true, + &max_covering_tombstone_seq, clock); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); } - hist.Add(Now(env, measured_by_nanosecond) - start_time); + 
hist.Add(Now(clock, measured_by_nanosecond) - start_time); } else { int r2_len; if (if_query_empty_keys) { @@ -196,7 +197,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string start_key = MakeKey(r1, r2, through_db); std::string end_key = MakeKey(r1, r2 + r2_len, through_db); uint64_t total_time = 0; - uint64_t start_time = Now(env, measured_by_nanosecond); + uint64_t start_time = Now(clock, measured_by_nanosecond); Iterator* iter = nullptr; InternalIterator* iiter = nullptr; if (!through_db) { @@ -214,10 +215,10 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, break; } // verify key; - total_time += Now(env, measured_by_nanosecond) - start_time; + total_time += Now(clock, measured_by_nanosecond) - start_time; assert(Slice(MakeKey(r1, r2 + count, through_db)) == (through_db ? iter->key() : iiter->key())); - start_time = Now(env, measured_by_nanosecond); + start_time = Now(clock, measured_by_nanosecond); if (++count >= r2_len) { break; } @@ -229,7 +230,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, assert(false); } delete iter; - total_time += Now(env, measured_by_nanosecond) - start_time; + total_time += Now(clock, measured_by_nanosecond) - start_time; hist.Add(total_time); } } diff --git a/table/table_test.cc b/table/table_test.cc index 3333f2bd239..2c5550a4f7c 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -155,6 +155,9 @@ void Increment(const Comparator* cmp, std::string* key) { } } +const auto kUnknownColumnFamily = + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + } // namespace // Helper class for tests to unify the interface between @@ -172,7 +175,7 @@ class Constructor { // Finish constructing the data structure with all the keys that have // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" - void Finish(const Options& options, const ImmutableCFOptions& ioptions, + void Finish(const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, @@ -191,7 +194,7 @@ class Constructor { // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, @@ -252,9 +255,11 @@ class KeyConvertingIterator : public InternalIterator { Slice key() const override { assert(Valid()); ParsedInternalKey parsed_key; - if (ParseInternalKey(iter_->key(), &parsed_key) != Status::OK()) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); + Status pik_status = + ParseInternalKey(iter_->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + status_ = pik_status; + return Slice(status_.getState()); } return parsed_key.user_key; } @@ -281,7 +286,7 @@ class BlockConstructor : public Constructor { : Constructor(cmp), comparator_(cmp), block_(nullptr) {} ~BlockConstructor() override { delete block_; } Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, + const ImmutableOptions& /*ioptions*/, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& table_options, const InternalKeyComparator& /*internal_comparator*/, @@ -336,18 +341,18 @@ class TableConstructor : public Constructor { } ~TableConstructor() override { Reset(); } - Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, + Status FinishImpl(const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& internal_comparator, const stl_wrappers::KVMap& kv_map) override { Reset(); soptions.use_mmap_reads = ioptions.allow_mmap_reads; - file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), - "" /* don't care */)); + std::unique_ptr sink(new test::StringSink()); + file_writer_.reset(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); std::unique_ptr builder; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; if (largest_seqno_ != 0) { // Pretend that it's an external file written by SstFileWriter. 
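For illustration, the test wiring that the TableConstructor hunk just above switches to: because WritableFileWriter and RandomAccessFileReader now accept FS-level files directly, the test::GetWritableFileWriter / test::GetRandomAccessFileReader helpers are dropped and the string sink/source are wrapped in place. The helper name MakeTestFiles is hypothetical.

    // Illustrative sketch, not part of the patch.
    #include <memory>
    #include <string>

    #include "file/random_access_file_reader.h"
    #include "file/writable_file_writer.h"
    #include "test_util/testutil.h"  // test::StringSink, test::StringSource

    namespace {
    using namespace ROCKSDB_NAMESPACE;

    void MakeTestFiles(const std::string& contents,
                       std::unique_ptr<WritableFileWriter>* writer,
                       std::unique_ptr<RandomAccessFileReader>* reader) {
      // StringSink/StringSource are FS-level files in this revision, so they
      // plug straight into the writer/reader constructors.
      std::unique_ptr<FSWritableFile> sink(new test::StringSink());
      writer->reset(new WritableFileWriter(std::move(sink), "" /* don't care */,
                                           FileOptions()));
      std::unique_ptr<FSRandomAccessFile> source(
          new test::StringSource(contents));
      reader->reset(new RandomAccessFileReader(std::move(source), "test"));
    }
    }  // namespace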
@@ -360,10 +365,8 @@ class TableConstructor : public Constructor { builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level_), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level_), file_writer_.get())); for (const auto& kv : kv_map) { @@ -375,25 +378,27 @@ class TableConstructor : public Constructor { } else { builder->Add(kv.first, kv.second); } - EXPECT_TRUE(builder->status().ok()); + EXPECT_OK(builder->status()); } Status s = builder->Finish(); - file_writer_->Flush(); + EXPECT_OK(file_writer_->Flush()); EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + std::unique_ptr source(new test::StringSource( + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)); + + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); const bool kSkipFilters = true; const bool kImmortal = true; return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, false, level_, largest_seqno_, &block_cache_tracer_, - moptions.write_buffer_size), + moptions.write_buffer_size, "", uniq_id_), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -421,10 +426,12 @@ class TableConstructor : public Constructor { key, TableReaderCaller::kUncategorized); } - virtual Status Reopen(const ImmutableCFOptions& ioptions, + virtual Status Reopen(const ImmutableOptions& ioptions, const MutableCFOptions& moptions) { - file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + std::unique_ptr source(new test::StringSource( + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)); + + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, *last_internal_key_), @@ -443,8 +450,7 @@ class TableConstructor : public Constructor { bool ConvertToInternalKey() { return convert_to_internal_key_; } test::StringSink* TEST_GetSink() { - return ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter( - file_writer_.get()); + return static_cast(file_writer_->writable_file()); } BlockCacheTracer block_cache_tracer_; @@ -482,27 +488,31 @@ class MemTableConstructor: public Constructor { write_buffer_manager_(wb), table_factory_(new SkipListFactory) { options_.memtable_factory = table_factory_; - ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); memtable_ = new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_), wb, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); } ~MemTableConstructor() override { delete memtable_->Unref(); } - Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, + Status FinishImpl(const Options&, const ImmutableOptions& ioptions, const MutableCFOptions& /*moptions*/, const 
BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, const stl_wrappers::KVMap& kv_map) override { delete memtable_->Unref(); - ImmutableCFOptions mem_ioptions(ioptions); + ImmutableOptions mem_ioptions(ioptions); memtable_ = new MemTable(internal_comparator_, mem_ioptions, MutableCFOptions(options_), write_buffer_manager_, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); int seq = 1; for (const auto& kv : kv_map) { - memtable_->Add(seq, kTypeValue, kv.first, kv.second); + Status s = memtable_->Add(seq, kTypeValue, kv.first, kv.second, + nullptr /* kv_prot_info */); + if (!s.ok()) { + return s; + } seq++; } return Status::OK(); @@ -554,7 +564,7 @@ class DBConstructor: public Constructor { } ~DBConstructor() override { delete db_; } Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, + const ImmutableOptions& /*ioptions*/, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, @@ -846,7 +856,7 @@ class HarnessTest : public testing::Test { constructor_.reset(new DBConstructor(options_.comparator)); break; } - ioptions_ = ImmutableCFOptions(options_); + ioptions_ = ImmutableOptions(options_); moptions_ = MutableCFOptions(options_); } @@ -1049,7 +1059,7 @@ class HarnessTest : public testing::Test { private: TestArgs args_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions moptions_; BlockBasedTableOptions table_options_; std::unique_ptr constructor_; @@ -1130,7 +1140,8 @@ class BlockBasedTableTest &trace_writer)); // Always return Status::OK(). assert(c->block_cache_tracer_ - .StartTrace(env_, trace_opt, std::move(trace_writer)) + .StartTrace(env_->GetSystemClock().get(), trace_opt, + std::move(trace_writer)) .ok()); { std::string user_key = "k01"; @@ -1225,7 +1236,9 @@ class FileChecksumTestHelper { void CreateWriteableFile() { sink_ = new test::StringSink(); - file_writer_.reset(test::GetWritableFileWriter(sink_, "" /* don't care */)); + std::unique_ptr holder(sink_); + file_writer_.reset(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); } void SetFileChecksumGenerator(FileChecksumGenerator* checksum_generator) { @@ -1265,15 +1278,15 @@ class FileChecksumTestHelper { EXPECT_TRUE(table_builder_->status().ok()); } Status s = table_builder_->Finish(); - file_writer_->Flush(); - EXPECT_TRUE(s.ok()); + EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(s); EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize()); return s; } std::string GetFileChecksum() { - file_writer_->Close(); + EXPECT_OK(file_writer_->Close()); return table_builder_->GetFileChecksum(); } @@ -1286,10 +1299,11 @@ class FileChecksumTestHelper { assert(file_checksum_generator != nullptr); cur_uniq_id_ = checksum_uniq_id_++; test::StringSink* ss_rw = - ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter( - file_writer_.get()); - file_reader_.reset(test::GetRandomAccessFileReader( - new test::StringSource(ss_rw->contents()))); + static_cast(file_writer_->writable_file()); + std::unique_ptr source( + new test::StringSource(ss_rw->contents())); + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); + std::unique_ptr scratch(new char[2048]); Slice result; uint64_t offset = 0; @@ -1396,9 +1410,8 @@ TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { table_options.block_restart_interval = 1; 
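// MemTable::Add now returns a Status and takes a key/value protection-info
// argument, so test constructors propagate insertion failures instead of
// silently dropping them. A minimal sketch of the checked insertion loop,
// assuming a ref'd MemTable* mem and a kv_map as in the constructor above:
SequenceNumber seq = 1;
for (const auto& kv : kv_map) {
  Status s = mem->Add(seq, kTypeValue, kv.first, kv.second,
                      nullptr /* kv_prot_info */);
  if (!s.ok()) {
    return s;  // surface the failure to the caller
  }
  ++seq;
}
return Status::OK();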
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); - ioptions.statistics = options.statistics.get(); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0); @@ -1445,9 +1458,8 @@ uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) { table_options.enable_index_compression = compressed; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); - ioptions.statistics = options.statistics.get(); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); c.ResetTableReader(); @@ -1472,7 +1484,7 @@ TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1506,7 +1518,7 @@ TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { options.table_properties_collector_factories.emplace_back( new DummyPropertiesCollectorFactory2()); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1550,7 +1562,7 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); std::unique_ptr internal_cmp( new InternalKeyComparator(options.comparator)); @@ -1571,7 +1583,8 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) { for (size_t i = 0; i < expected_tombstones.size(); i++) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey parsed_key; - ASSERT_OK(ParseInternalKey(iter->key(), &parsed_key)); + ASSERT_OK( + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */)); RangeTombstone t(parsed_key, iter->value()); const auto& expected_t = expected_tombstones[i]; ASSERT_EQ(t.start_key_, expected_t.start_key_); @@ -1593,7 +1606,7 @@ TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) { Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1637,7 +1650,7 @@ void PrefetchRange(TableConstructor* c, Options* opt, // reset the cache and reopen the table table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); - const ImmutableCFOptions ioptions2(*opt); + const ImmutableOptions ioptions2(*opt); const MutableCFOptions moptions(*opt); ASSERT_OK(c->Reopen(ioptions2, moptions)); @@ -1695,7 
+1708,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); c.ResetTableReader(); @@ -1796,7 +1809,7 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { c.Add("cccc2", std::string('a', 56)); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1855,7 +1868,7 @@ TEST_P(BlockBasedTableTest, NoopTransformSeek) { c.Add(key.Encode().ToString(), "b"); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, @@ -1893,14 +1906,14 @@ TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { c.Add(key.Encode().ToString(), "test"); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, &keys, &kvmap); // TODO(Zhongyi): update test to use MutableCFOptions options.prefix_extractor.reset(NewFixedPrefixTransform(9)); - const ImmutableCFOptions new_ioptions(options); + const ImmutableOptions new_ioptions(options); const MutableCFOptions new_moptions(options); ASSERT_OK(c.Reopen(new_ioptions, new_moptions)); auto reader = c.GetTableReader(); @@ -1957,7 +1970,7 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, &kvmap); @@ -2160,7 +2173,7 @@ TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator()); @@ -2247,7 +2260,7 @@ TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) { Statistics* stats = options.statistics.get(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator()); @@ -2445,7 +2458,7 @@ TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor 
c(BytewiseComparator(), /* convert_to_internal_key */ false, @@ -2520,7 +2533,7 @@ TEST_P(BlockBasedTableTest, IndexSizeStat) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); @@ -2549,7 +2562,7 @@ TEST_P(BlockBasedTableTest, NumBlockStat) { std::vector ks; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); @@ -2570,7 +2583,7 @@ TEST_P(BlockBasedTableTest, TracingGetTest) { SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2644,7 +2657,7 @@ TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2688,7 +2701,7 @@ TEST_P(BlockBasedTableTest, TracingIterator) { SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2818,7 +2831,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2870,7 +2883,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2950,7 +2963,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { table_options.block_cache = NewLRUCache(1, 4); options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(options); + const ImmutableOptions ioptions2(options); const MutableCFOptions moptions2(options); ASSERT_OK(c.Reopen(ioptions2, moptions2)); { @@ -2997,7 +3010,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { std::string user_key = "k01"; InternalKey internal_key(user_key, 0, kTypeValue); c3.Add(internal_key.Encode().ToString(), "hello"); - ImmutableCFOptions ioptions3(options); + ImmutableOptions ioptions3(options); MutableCFOptions 
moptions3(options); // Generate table without filter policy c3.Finish(options, ioptions3, moptions3, table_options, @@ -3008,7 +3021,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { table_options.filter_policy.reset(NewBloomFilterPolicy(1)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); - ImmutableCFOptions ioptions4(options); + ImmutableOptions ioptions4(options); MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); @@ -3092,7 +3105,7 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { InternalKey internal_key(user_key, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c.Add(encoded_key, "hello"); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); // Generate table with filter policy c.Finish(options, ioptions, moptions, table_options, @@ -3180,7 +3193,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); @@ -3195,7 +3208,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { ASSERT_OK(iter->status()); iter.reset(); - const ImmutableCFOptions ioptions1(opt); + const ImmutableOptions ioptions1(opt); const MutableCFOptions moptions1(opt); ASSERT_OK(c.Reopen(ioptions1, moptions1)); auto table_reader = dynamic_cast(c.GetTableReader()); @@ -3208,7 +3221,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { // rerun with different block cache table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(opt); + const ImmutableOptions ioptions2(opt); const MutableCFOptions moptions2(opt); ASSERT_OK(c.Reopen(ioptions2, moptions2)); table_reader = dynamic_cast(c.GetTableReader()); @@ -3268,7 +3281,7 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) { c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); @@ -3294,14 +3307,13 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) { // Test the file checksum of block based table TEST_P(BlockBasedTableTest, NoFileChecksum) { Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); int level = 0; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; FileChecksumTestHelper f(true); @@ -3310,14 +3322,12 @@ TEST_P(BlockBasedTableTest, NoFileChecksum) { builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, *comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, 
options.compression_opts, + kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName); ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum); } @@ -3327,14 +3337,13 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { new FileChecksumGenCrc32cFactory(); Options options; options.file_checksum_gen_factory.reset(file_checksum_gen_factory); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); int level = 0; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; FileChecksumGenContext gen_context; @@ -3349,14 +3358,12 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, *comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); std::unique_ptr checksum_crc32c_gen2 = @@ -3386,23 +3393,21 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { plain_table_options.hash_table_ratio = 0; PlainTableFactory factory(plain_table_options); - test::StringSink sink; - std::unique_ptr file_writer( - test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + std::unique_ptr sink(new test::StringSink()); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); Options options; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -3412,13 +3417,14 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { builder->Add(key, value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); test::StringSink* ss = - ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter(file_writer.get()); + 
static_cast(file_writer->writable_file()); + std::unique_ptr source( + new test::StringSource(ss->contents(), 72242, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss->contents(), 72242, true))); + new RandomAccessFileReader(std::move(source), "test")); TableProperties* props = nullptr; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), @@ -3443,26 +3449,24 @@ TEST_F(PlainTableTest, NoFileChecksum) { PlainTableFactory factory(plain_table_options); Options options; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; FileChecksumTestHelper f(true); f.CreateWriteableFile(); std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), f.GetFileWriter())); ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName); EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum); } @@ -3478,11 +3482,10 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { new FileChecksumGenCrc32cFactory(); Options options; options.file_checksum_gen_factory.reset(file_checksum_gen_factory); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; @@ -3496,15 +3499,14 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), f.GetFileWriter())); ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); std::unique_ptr checksum_crc32c_gen2 = @@ -3529,11 +3531,12 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { std::vector keys; stl_wrappers::KVMap kvmap; Options options; + options.db_host_id = ""; test::PlainInternalKeyComparator internal_comparator(options.comparator); options.compression = kNoCompression; BlockBasedTableOptions table_options; table_options.block_size 
= 1024; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, internal_comparator, &keys, &kvmap); @@ -3569,16 +3572,16 @@ static void DoCompressionTest(CompressionType comp) { options.compression = comp; BlockBasedTableOptions table_options; table_options.block_size = 1024; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7050)); c.ResetTableReader(); } @@ -3661,19 +3664,33 @@ TEST_F(DBHarnessTest, RandomizedLongDB) { #endif // ROCKSDB_LITE #endif // ROCKSDB_VALGRIND_RUN -class MemTableTest : public testing::Test {}; +class MemTableTest : public testing::Test { + public: + MemTableTest() { + InternalKeyComparator cmp(BytewiseComparator()); + auto table_factory = std::make_shared(); + options_.memtable_factory = table_factory; + ImmutableOptions ioptions(options_); + wb_ = new WriteBufferManager(options_.db_write_buffer_size); + memtable_ = new MemTable(cmp, ioptions, MutableCFOptions(options_), wb_, + kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + } + + ~MemTableTest() { + delete memtable_->Unref(); + delete wb_; + } + + MemTable* GetMemTable() { return memtable_; } + + private: + MemTable* memtable_; + Options options_; + WriteBufferManager* wb_; +}; TEST_F(MemTableTest, Simple) { - InternalKeyComparator cmp(BytewiseComparator()); - auto table_factory = std::make_shared(); - Options options; - options.memtable_factory = table_factory; - ImmutableCFOptions ioptions(options); - WriteBufferManager wb(options.db_write_buffer_size); - MemTable* memtable = - new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, - kMaxSequenceNumber, 0 /* column_family_id */); - memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); ASSERT_OK(batch.Put(std::string("k1"), std::string("v1"))); @@ -3682,7 +3699,7 @@ TEST_F(MemTableTest, Simple) { ASSERT_OK(batch.Put(std::string("largekey"), std::string("vlarge"))); ASSERT_OK(batch.DeleteRange(std::string("chi"), std::string("xigua"))); ASSERT_OK(batch.DeleteRange(std::string("begin"), std::string("end"))); - ColumnFamilyMemTablesDefault cf_mems_default(memtable); + ColumnFamilyMemTablesDefault cf_mems_default(GetMemTable()); ASSERT_TRUE( WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr) .ok()); @@ -3693,10 +3710,10 @@ TEST_F(MemTableTest, Simple) { std::unique_ptr iter_guard; InternalIterator* iter; if (i == 0) { - iter = memtable->NewIterator(ReadOptions(), &arena); + iter = GetMemTable()->NewIterator(ReadOptions(), &arena); arena_iter_guard.set(iter); } else { - iter = memtable->NewRangeTombstoneIterator( + iter = GetMemTable()->NewRangeTombstoneIterator( ReadOptions(), kMaxSequenceNumber /* read_seq */); iter_guard.reset(iter); } @@ -3710,8 +3727,6 
@@ TEST_F(MemTableTest, Simple) { iter->Next(); } } - - delete memtable->Unref(); } // Test the empty key @@ -3893,6 +3908,8 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { table_options.index_block_restart_interval = index_block_restart_interval; if (value_delta_encoding) { table_options.format_version = 4; + } else { + table_options.format_version = 3; } options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -3907,7 +3924,7 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { stl_wrappers::KVMap kvmap; std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, &kvmap); @@ -3959,8 +3976,7 @@ class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform { } bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override { - assert(IsValid(src)); - return true; + return IsValid(src); } bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override { @@ -4003,7 +4019,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { const std::string kDBPath = test::PerThreadDBPath("table_prefix_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyDB(kDBPath, options); + ASSERT_OK(DestroyDB(kDBPath, options)); ROCKSDB_NAMESPACE::DB* db; ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); @@ -4033,15 +4049,15 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; int_tbl_prop_collector_factories.emplace_back( new SstFileWriterPropertiesCollectorFactory(2 /* version */, 0 /* global_seqno*/)); @@ -4049,9 +4065,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -4062,7 +4077,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); test::RandomRWStringSink ss_rw(sink); uint32_t version; @@ -4071,9 +4086,10 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Helper function to get version, global_seqno, global_seqno_offset std::function GetVersionAndGlobalSeqno = [&]() { + std::unique_ptr source( + new 
test::StringSource(ss_rw.contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "")); TableProperties* props = nullptr; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), @@ -4096,16 +4112,18 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { std::string new_global_seqno; PutFixed64(&new_global_seqno, val); - ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno)); + ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno, IOOptions(), + nullptr)); }; // Helper function to get the contents of the table InternalIterator std::unique_ptr table_reader; const ReadOptions read_options; std::function GetTableInternalIter = [&]() { + std::unique_ptr source( + new test::StringSource(ss_rw.contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "")); options.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), @@ -4125,7 +4143,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { char current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 0); @@ -4146,7 +4164,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 10); @@ -4164,7 +4182,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 10); @@ -4183,7 +4201,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 3); @@ -4202,7 +4220,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 3); @@ -4217,23 +4235,22 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const 
ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (int i = 1; i <= 10000; ++i) { @@ -4246,19 +4263,18 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); - test::RandomRWStringSink ss_rw(sink); + std::unique_ptr source( + new test::StringSource(sink->contents(), 73342, false)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); - + new RandomAccessFileReader(std::move(source), "test")); // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { TableProperties* props = nullptr; - ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), - kBlockBasedTableMagicNumber, ioptions, - &props, true /* compression_type_missing */)); + ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), + kBlockBasedTableMagicNumber, ioptions, &props, + true /* compression_type_missing */)); uint64_t data_block_size = props->data_size / props->num_data_blocks; ASSERT_EQ(data_block_size, 4096); @@ -4275,14 +4291,14 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { bbto.block_align = false; Options options2; options2.table_factory.reset(NewBlockBasedTableFactory(bbto)); - ImmutableCFOptions ioptions2(options2); + ImmutableOptions ioptions2(options2); const MutableCFOptions moptions2(options2); ASSERT_OK(ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions2, moptions2.prefix_extractor.get(), EnvOptions(), GetPlainInternalComparator(options2.comparator)), - std::move(file_reader), ss_rw.contents().size(), &table_reader)); + std::move(file_reader), sink->contents().size(), &table_reader)); ReadOptions read_options; std::unique_ptr db_iter(table_reader->NewIterator( @@ -4309,26 +4325,25 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( 
TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (int i = 1; i <= 10000; ++i) { @@ -4341,16 +4356,16 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); - test::RandomRWStringSink ss_rw(sink); + std::unique_ptr source( + new test::StringSource(sink->contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "test")); { RandomAccessFileReader* file = file_reader.get(); - uint64_t file_size = ss_rw.contents().size(); + uint64_t file_size = sink->contents().size(); Footer footer; IOOptions opts; @@ -4424,7 +4439,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { table_options.filter_policy.reset(NewBloomFilterPolicy( 8 /* bits_per_key */, false /* use_block_based_filter */)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); std::vector keys; stl_wrappers::KVMap kvmap; @@ -4433,10 +4448,11 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { // get file reader test::StringSink* table_sink = c.TEST_GetSink(); - std::unique_ptr table_reader{ - test::GetRandomAccessFileReader( - new test::StringSource(table_sink->contents(), 0 /* unique_id */, - false /* allow_mmap_reads */))}; + std::unique_ptr source(new test::StringSource( + table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */)); + + std::unique_ptr table_reader( + new RandomAccessFileReader(std::move(source), "test")); size_t table_size = table_sink->contents().size(); // read footer @@ -4492,7 +4508,7 @@ TEST_P(BlockBasedTableTest, BadOptions) { const std::string kDBPath = test::PerThreadDBPath("block_based_table_bad_options_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyDB(kDBPath, options); + ASSERT_OK(DestroyDB(kDBPath, options)); ROCKSDB_NAMESPACE::DB* db; ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); @@ -4540,9 +4556,9 @@ TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { TailPrefetchStats tpstats; FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); IOOptions opts; - buffer.TryReadFromCache(opts, 500, 10, nullptr); - buffer.TryReadFromCache(opts, 480, 10, nullptr); - buffer.TryReadFromCache(opts, 490, 10, nullptr); + buffer.TryReadFromCache(opts, 500, 10, nullptr, nullptr); + buffer.TryReadFromCache(opts, 480, 10, nullptr, nullptr); + buffer.TryReadFromCache(opts, 490, 10, nullptr, nullptr); ASSERT_EQ(480, buffer.min_offset_read()); } @@ -4572,7 +4588,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, @@ -4655,7 +4671,7 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) { Options options; 
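// The BlockAlignTest / PropertiesBlockRestartPointTest hunks above build a
// table with bbto.block_align = true and kNoCompression (alignment padding is
// only applied to uncompressed blocks), then confirm the layout by reading
// the table properties back. A minimal sketch of that verification, assuming
// a RandomAccessFileReader over the written sink contents as constructed
// above; 4096 is the alignment the test expects.
TableProperties* props = nullptr;
ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(),
                              kBlockBasedTableMagicNumber, ioptions, &props,
                              true /* compression_type_missing */));
uint64_t data_block_size = props->data_size / props->num_data_blocks;
ASSERT_EQ(data_block_size, 4096);
ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks);
delete props;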
BlockBasedTableOptions table_opt(GetBlockBasedTableOptions()); options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_opt, GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); @@ -4695,7 +4711,7 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { table_opt.flush_block_policy_factory = std::make_shared(); options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_opt, GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); diff --git a/test_util/mock_time_env.cc b/test_util/mock_time_env.cc index 8316406ec59..23888e69e3f 100644 --- a/test_util/mock_time_env.cc +++ b/test_util/mock_time_env.cc @@ -12,7 +12,7 @@ namespace ROCKSDB_NAMESPACE { // TODO: this is a workaround for the different behavior on different platform // for timedwait timeout. Ideally timedwait API should be moved to env. // details: PR #7101. -void MockTimeEnv::InstallTimedWaitFixCallback() { +void MockSystemClock::InstallTimedWaitFixCallback() { #ifndef NDEBUG SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); diff --git a/test_util/mock_time_env.h b/test_util/mock_time_env.h index 1f454144ad1..61dc4e4437b 100644 --- a/test_util/mock_time_env.h +++ b/test_util/mock_time_env.h @@ -6,18 +6,21 @@ #pragma once #include +#include -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { // NOTE: SpecialEnv offers most of this functionality, along with hooks // for safe DB behavior under a mock time environment, so should be used -// instead of MockTimeEnv for DB tests. -class MockTimeEnv : public EnvWrapper { +// instead of MockSystemClock for DB tests. +class MockSystemClock : public SystemClockWrapper { public: - explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} + explicit MockSystemClock(const std::shared_ptr& base) + : SystemClockWrapper(base) {} + const char* Name() const override { return "MockSystemClock"; } virtual Status GetCurrentTime(int64_t* time_sec) override { assert(time_sec != nullptr); *time_sec = static_cast(current_time_us_ / kMicrosInSecond); @@ -33,9 +36,9 @@ class MockTimeEnv : public EnvWrapper { return current_time_us_ * 1000; } - uint64_t RealNowMicros() { return target()->NowMicros(); } + uint64_t RealNowMicros() { return target_->NowMicros(); } - void set_current_time(uint64_t time_sec) { + void SetCurrentTime(uint64_t time_sec) { assert(time_sec < std::numeric_limits::max() / kMicrosInSecond); assert(time_sec * kMicrosInSecond >= current_time_us_); current_time_us_ = time_sec * kMicrosInSecond; diff --git a/test_util/sync_point.cc b/test_util/sync_point.cc index 16eb4e55315..4c71fc6bc46 100644 --- a/test_util/sync_point.cc +++ b/test_util/sync_point.cc @@ -6,11 +6,9 @@ #include "test_util/sync_point.h" #include -#include #include "test_util/sync_point_impl.h" -int rocksdb_kill_odds = 0; std::vector rocksdb_kill_exclude_prefixes; #ifndef NDEBUG diff --git a/test_util/sync_point.h b/test_util/sync_point.h index 08d6c037abb..775fd5c36e6 100644 --- a/test_util/sync_point.h +++ b/test_util/sync_point.h @@ -13,34 +13,42 @@ #include "rocksdb/rocksdb_namespace.h" -// This is only set from db_stress.cc and for testing only. 
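// The mock_time_env.h hunk above renames MockTimeEnv to MockSystemClock and
// rebases it on SystemClockWrapper, so tests wrap a SystemClock rather than
// an Env. A minimal usage sketch, assuming SystemClock::Default() as the
// wrapped clock; the variable names are illustrative.
auto mock_clock = std::make_shared<MockSystemClock>(SystemClock::Default());
mock_clock->SetCurrentTime(100 /* seconds */);    // was set_current_time()
int64_t now_sec = 0;
ASSERT_OK(mock_clock->GetCurrentTime(&now_sec));  // reads the mocked time
uint64_t real_us = mock_clock->RealNowMicros();   // bypasses the mock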
-// If non-zero, kill at various points in source code with probability 1/this -extern int rocksdb_kill_odds; -// If kill point has a prefix on this list, will skip killing. -extern std::vector rocksdb_kill_exclude_prefixes; - #ifdef NDEBUG // empty in release build -#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds) +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) +#define TEST_KILL_RANDOM(kill_point) #else namespace ROCKSDB_NAMESPACE { -// Kill the process with probability 1/odds for testing. -extern void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline); // To avoid crashing always at some frequently executed codepaths (during // kill random test), use this factor to reduce odds #define REDUCE_ODDS 2 #define REDUCE_ODDS2 4 -#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds) \ - { \ - if (rocksdb_kill_odds > 0) { \ - TestKillRandom(kill_point, rocksdb_kill_odds, __FILE__, __LINE__); \ - } \ +// A class used to pass when a kill point is reached. +struct KillPoint { + public: + // This is only set from db_stress.cc and for testing only. + // If non-zero, kill at various points in source code with probability 1/this + int rocksdb_kill_odds = 0; + // If kill point has a prefix on this list, will skip killing. + std::vector rocksdb_kill_exclude_prefixes; + // Kill the process with probability 1/odds for testing. + void TestKillRandom(std::string kill_point, int odds, + const std::string& srcfile, int srcline); + + static KillPoint* GetInstance(); +}; + +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) \ + { \ + KillPoint::GetInstance()->TestKillRandom( \ + kill_point, rocksdb_kill_odds_weight, __FILE__, __LINE__); \ } +#define TEST_KILL_RANDOM(kill_point) TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, 1) } // namespace ROCKSDB_NAMESPACE + #endif #ifdef NDEBUG diff --git a/test_util/sync_point_impl.cc b/test_util/sync_point_impl.cc index e1877e39860..1d87a05fe07 100644 --- a/test_util/sync_point_impl.cc +++ b/test_util/sync_point_impl.cc @@ -7,9 +7,17 @@ #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { +KillPoint* KillPoint::GetInstance() { + static KillPoint kp; + return &kp; +} -void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline) { +void KillPoint::TestKillRandom(std::string kill_point, int odds_weight, + const std::string& srcfile, int srcline) { + if (rocksdb_kill_odds <= 0) { + return; + } + int odds = rocksdb_kill_odds * odds_weight; for (auto& p : rocksdb_kill_exclude_prefixes) { if (kill_point.substr(0, p.length()) == p) { return; @@ -29,7 +37,6 @@ void TestKillRandom(std::string kill_point, int odds, } } - void SyncPoint::Data::LoadDependency(const std::vector& dependencies) { std::lock_guard lock(mutex_); successors_.clear(); @@ -38,6 +45,8 @@ void SyncPoint::Data::LoadDependency(const std::vector& dependenc for (const auto& dependency : dependencies) { successors_[dependency.predecessor].push_back(dependency.successor); predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); } cv_.notify_all(); } @@ -54,11 +63,15 @@ void SyncPoint::Data::LoadDependencyAndMarkers( for (const auto& dependency : dependencies) { successors_[dependency.predecessor].push_back(dependency.successor); predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); } for (const 
auto& marker : markers) { successors_[marker.predecessor].push_back(marker.successor); predecessors_[marker.successor].push_back(marker.predecessor); markers_[marker.predecessor].push_back(marker.successor); + point_filter_.Add(marker.predecessor); + point_filter_.Add(marker.successor); } cv_.notify_all(); } @@ -92,6 +105,10 @@ void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { if (!enabled_) { return; } + // Use a filter to prevent mutex lock if possible. + if (!point_filter_.MayContain(point)) { + return; + } std::unique_lock lock(mutex_); auto thread_id = std::this_thread::get_id(); @@ -100,6 +117,7 @@ void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { if (marker_iter != markers_.end()) { for (auto& marked_point : marker_iter->second) { marked_thread_id_.emplace(marked_point, thread_id); + point_filter_.Add(marked_point); } } diff --git a/test_util/sync_point_impl.h b/test_util/sync_point_impl.h index b246c019857..ba818e381fd 100644 --- a/test_util/sync_point_impl.h +++ b/test_util/sync_point_impl.h @@ -3,9 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "test_util/sync_point.h" - #include + #include #include #include @@ -15,15 +14,39 @@ #include #include +#include "memory/concurrent_arena.h" #include "port/port.h" +#include "test_util/sync_point.h" +#include "util/dynamic_bloom.h" #include "util/random.h" #pragma once #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { +// A hacky allocator for single use. +// Arena depends on SyncPoint and create circular dependency. +class SingleAllocator : public Allocator { + public: + char* Allocate(size_t) override { + assert(false); + return nullptr; + } + char* AllocateAligned(size_t bytes, size_t, Logger*) override { + buf_.resize(bytes); + return const_cast(buf_.data()); + } + size_t BlockSize() const override { + assert(false); + return 0; + } + + private: + std::string buf_; +}; + struct SyncPoint::Data { - Data() : enabled_(false) {} + Data() : point_filter_(&alloc_, /*total_bits=*/8192), enabled_(false) {} // Enable proper deletion by subclasses virtual ~Data() {} // successor/predecessor map loaded from LoadDependency @@ -37,6 +60,9 @@ struct SyncPoint::Data { std::condition_variable cv_; // sync points that have been passed through std::unordered_set cleared_points_; + SingleAllocator alloc_; + // A filter before holding mutex to speed up process. 
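// The sync_point changes above move the kill-test knobs from process-wide
// globals onto a KillPoint singleton, and the kill macros now take the point
// name plus an optional weight. A minimal sketch of how a stress-test driver
// would configure it; the point names and prefixes shown are illustrative.
KillPoint* kp = KillPoint::GetInstance();
kp->rocksdb_kill_odds = 10;                     // kill with probability 1/10
kp->rocksdb_kill_exclude_prefixes = {"Posix"};  // skip points with this prefix
// In instrumented code paths:
TEST_KILL_RANDOM("SomePoint");                               // weight 1
TEST_KILL_RANDOM_WITH_WEIGHT("SomeHotPoint", REDUCE_ODDS2);  // 4x less likely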
+ DynamicBloom point_filter_; std::atomic enabled_; int num_callbacks_running_ = 0; @@ -48,6 +74,7 @@ struct SyncPoint::Data { const std::function& callback) { std::lock_guard lock(mutex_); callbacks_[point] = callback; + point_filter_.Add(point); } void ClearCallBack(const std::string& point); diff --git a/test_util/testharness.cc b/test_util/testharness.cc index 50e105c51d3..d8650dafbd7 100644 --- a/test_util/testharness.cc +++ b/test_util/testharness.cc @@ -14,6 +14,14 @@ namespace ROCKSDB_NAMESPACE { namespace test { +#ifdef OS_WIN +#include + +std::string GetPidStr() { return std::to_string(GetCurrentProcessId()); } +#else +std::string GetPidStr() { return std::to_string(getpid()); } +#endif + ::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) { if (s.ok()) { return ::testing::AssertionSuccess(); @@ -32,7 +40,7 @@ std::string TmpDir(Env* env) { std::string PerThreadDBPath(std::string dir, std::string name) { size_t tid = std::hash()(std::this_thread::get_id()); - return dir + "/" + name + "_" + std::to_string(tid); + return dir + "/" + name + "_" + GetPidStr() + "_" + std::to_string(tid); } std::string PerThreadDBPath(std::string name) { diff --git a/test_util/testharness.h b/test_util/testharness.h index 60a195e2b79..739f32cb9b0 100644 --- a/test_util/testharness.h +++ b/test_util/testharness.h @@ -15,6 +15,42 @@ #include #endif +// A "skipped" test has a specific meaning in Facebook infrastructure: the +// test is in good shape and should be run, but something about the +// compilation or execution environment means the test cannot be run. +// Specifically, there is a hole in intended testing if any +// parameterization of a test (e.g. Foo/FooTest.Bar/42) is skipped for all +// tested build configurations/platforms/etc. +// +// If GTEST_SKIP is available, use it. Otherwise, define skip as success. +// +// The GTEST macros do not seem to print the message, even with -verbose, +// so these print to stderr. Note that these do not exit the test themselves; +// calling code should 'return' or similar from the test. +#ifdef GTEST_SKIP_ +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SKIP_(m); \ + } while (false) /* user ; */ +#else +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SUCCESS_("SKIPPED: " m); \ + } while (false) /* user ; */ +#endif + +// We add "bypass" as an alternative to ROCKSDB_GTEST_SKIP that is allowed to +// be a permanent condition, e.g. for intentionally omitting or disabling some +// parameterizations for some tests. (Use _DISABLED at the end of the test +// name to disable an entire test.) 
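// With the DynamicBloom point_filter_ added above, SyncPoint::Process() can
// reject names that were never registered before taking the mutex. A minimal
// sketch of the fast path this buys, assuming the standard SyncPoint API;
// the point names are illustrative.
SyncPoint::GetInstance()->SetCallBack(
    "Illustrative::Point", [](void* /*arg*/) { /* observe the hit */ });
SyncPoint::GetInstance()->EnableProcessing();
TEST_SYNC_POINT("Illustrative::Point");  // in the filter: lock + callback
TEST_SYNC_POINT("Never::Registered");    // usually rejected by MayContain()
                                         // before locking (Bloom filter, so
                                         // false positives are possible)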
+#define ROCKSDB_GTEST_BYPASS(m) \ + do { \ + fputs("BYPASSED: " m "\n", stderr); \ + GTEST_SUCCESS_("BYPASSED: " m); \ + } while (false) /* user ; */ + #include #include "rocksdb/env.h" diff --git a/test_util/testutil.cc b/test_util/testutil.cc index d05bb766e27..450598cecb9 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -24,6 +24,7 @@ #include "file/writable_file_writer.h" #include "port/port.h" #include "rocksdb/convenience.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/random.h" @@ -171,25 +172,6 @@ const Comparator* ComparatorWithU64Ts() { return &comp_with_u64_ts; } -WritableFileWriter* GetWritableFileWriter(WritableFile* wf, - const std::string& fname) { - std::unique_ptr file(wf); - return new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - fname, EnvOptions()); -} - -RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { - std::unique_ptr file(raf); - return new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - "[test RandomAccessFileReader]"); -} - -SequentialFileReader* GetSequentialFileReader(SequentialFile* se, - const std::string& fname) { - std::unique_ptr file(se); - return new SequentialFileReader(NewLegacySequentialFileWrapper(file), fname); -} - void CorruptKeyType(InternalKey* ikey) { std::string keystr = ikey->Encode().ToString(); keystr[keystr.size() - 8] = kTypeLogData; @@ -205,6 +187,38 @@ std::string KeyStr(const std::string& user_key, const SequenceNumber& seq, return k.Encode().ToString(); } +std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt) { + std::string user_key_with_ts(user_key); + std::string ts_str; + PutFixed64(&ts_str, ts); + user_key_with_ts.append(ts_str); + return KeyStr(user_key_with_ts, seq, t, corrupt); +} + +bool SleepingBackgroundTask::TimedWaitUntilSleeping(uint64_t wait_time) { + auto abs_time = SystemClock::Default()->NowMicros() + wait_time; + MutexLock l(&mutex_); + while (!sleeping_ || !should_sleep_) { + if (bg_cv_.TimedWait(abs_time)) { + return true; + } + } + return false; +} + +bool SleepingBackgroundTask::TimedWaitUntilDone(uint64_t wait_time) { + auto abs_time = SystemClock::Default()->NowMicros() + wait_time; + MutexLock l(&mutex_); + while (!done_with_sleep_) { + if (bg_cv_.TimedWait(abs_time)) { + return true; + } + } + return false; +} + std::string RandomName(Random* rnd, const size_t len) { std::stringstream ss; for (size_t i = 0; i < len; ++i) { @@ -309,6 +323,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->error_if_exists = rnd->Uniform(2); db_opt->is_fd_close_on_exec = rnd->Uniform(2); db_opt->paranoid_checks = rnd->Uniform(2); + db_opt->track_and_verify_wals_in_manifest = rnd->Uniform(2); db_opt->skip_log_error_on_recovery = rnd->Uniform(2); db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2); @@ -370,12 +385,14 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2); cf_opt->memtable_whole_key_filtering = rnd->Uniform(2); cf_opt->enable_blob_files = rnd->Uniform(2); + cf_opt->enable_blob_garbage_collection = rnd->Uniform(2); // double options cf_opt->hard_rate_limit = static_cast(rnd->Uniform(10000)) / 13; cf_opt->soft_rate_limit = static_cast(rnd->Uniform(10000)) / 13; cf_opt->memtable_prefix_bloom_size_ratio = static_cast(rnd->Uniform(10000)) / 20000.0; 
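// A minimal usage sketch for the ROCKSDB_GTEST_SKIP / ROCKSDB_GTEST_BYPASS
// macros introduced in testharness.h above; per the comment there, they do
// not exit the test by themselves. The test name, fixture members, and the
// test::IsDirectIOSupported() gate are illustrative.
TEST_F(SomeEnvDependentTest, DirectIORead) {
  if (!test::IsDirectIOSupported(env_, test_dir_)) {
    ROCKSDB_GTEST_SKIP("Direct I/O is not supported in this environment");
    return;  // the macro records SKIPPED/SUCCESS but does not return
  }
  // ... actual test body ...
}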
+ cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0; // int options cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100); @@ -458,6 +475,26 @@ bool IsDirectIOSupported(Env* env, const std::string& dir) { return s.ok(); } +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir) { + bool supported = false; + std::string tmp = TempFileName(dir, 999); + Random rnd(301); + std::string test_string = rnd.RandomString(4096); + Slice data(test_string); + Status s = WriteStringToFile(fs.get(), data, tmp, true); + if (s.ok()) { + std::unique_ptr file; + auto io_s = fs->NewRandomAccessFile(tmp, FileOptions(), &file, nullptr); + if (io_s.ok()) { + supported = !(file->Prefetch(0, data.size(), IOOptions(), nullptr) + .IsNotSupported()); + } + s = fs->DeleteFile(tmp, IOOptions(), nullptr); + } + return s.ok() && supported; +} + size_t GetLinesCount(const std::string& fname, const std::string& pattern) { std::stringstream ssbuf; std::string line; @@ -475,46 +512,93 @@ size_t GetLinesCount(const std::string& fname, const std::string& pattern) { return count; } - -void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) { - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - // strerror is not thread-safe so should not be used in the "passing" path - // of unit tests (sometimes parallelized) but is OK here where test fails - const char* msg = strerror(errno); - fprintf(stderr, "%s:%s\n", fname.c_str(), msg); - assert(false); - } - - if (offset < 0) { +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum /*=true*/) { + uint64_t size; + Status s = env->GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } else if (offset < 0) { // Relative to end of file; make it absolute - if (-offset > sbuf.st_size) { + if (-offset > static_cast(size)) { offset = 0; } else { - offset = static_cast(sbuf.st_size + offset); + offset = static_cast(size + offset); } } - if (offset > sbuf.st_size) { - offset = static_cast(sbuf.st_size); + if (offset > static_cast(size)) { + offset = static_cast(size); } - if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = static_cast(sbuf.st_size - offset); + if (offset + bytes_to_corrupt > static_cast(size)) { + bytes_to_corrupt = static_cast(size - offset); } // Do it std::string contents; - Status s = ReadFileToString(Env::Default(), fname, &contents); - assert(s.ok()); - for (int i = 0; i < bytes_to_corrupt; i++) { - contents[i + offset] ^= 0x80; + s = ReadFileToString(env, fname, &contents); + if (s.ok()) { + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(env, contents, fname); } - s = WriteStringToFile(Env::Default(), contents, fname); - assert(s.ok()); - Options options; - EnvOptions env_options; + if (s.ok() && verify_checksum) { #ifndef ROCKSDB_LITE - assert(!VerifySstFileChecksum(options, env_options, fname).ok()); + Options options; + options.env = env; + EnvOptions env_options; + Status v = VerifySstFileChecksum(options, env_options, fname); + assert(!v.ok()); #endif + } + return s; +} + +Status TruncateFile(Env* env, const std::string& fname, uint64_t new_length) { + uint64_t old_length; + Status s = env->GetFileSize(fname, &old_length); + if (!s.ok() || new_length == old_length) { + return s; + } + // Do it + std::string contents; + s = ReadFileToString(env, fname, &contents); + if (s.ok()) { + contents.resize(static_cast(new_length), 'b'); + s = 
WriteStringToFile(env, contents, fname); + } + return s; +} + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname) { + bool is_dir = false; + Status s = env->IsDirectory(dirname, &is_dir); + if (s.ok() && is_dir) { + s = env->DeleteDir(dirname); + } + return s; +} + +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname) { + TryDeleteDir(env, dirname).PermitUncheckedError(); +} + +Status CreateEnvFromSystem(const ConfigOptions& config_options, Env** result, + std::shared_ptr* guard) { + const char* env_uri = getenv("TEST_ENV_URI"); + const char* fs_uri = getenv("TEST_FS_URI"); + if (env_uri || fs_uri) { + return Env::CreateFromUri(config_options, + (env_uri != nullptr) ? env_uri : "", + (fs_uri != nullptr) ? fs_uri : "", result, guard); + } else { + // Neither specified. Use the default + *result = config_options.env; + guard->reset(); + return Status::OK(); + } } } // namespace test diff --git a/test_util/testutil.h b/test_util/testutil.h index 4255a48f2d5..ae6d1dec4ef 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -22,12 +22,11 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/block_based/block_based_table_factory.h" #include "table/internal_iterator.h" -#include "table/plain/plain_table_factory.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +class FileSystem; class Random; class SequentialFile; class SequentialFileReader; @@ -55,9 +54,10 @@ class ErrorEnv : public EnvWrapper { bool writable_file_error_; int num_writable_file_errors_; - ErrorEnv() : EnvWrapper(Env::Default()), - writable_file_error_(false), - num_writable_file_errors_(0) { } + ErrorEnv(Env* _target) + : EnvWrapper(_target), + writable_file_error_(false), + num_writable_file_errors_(0) {} virtual Status NewWritableFile(const std::string& fname, std::unique_ptr* result, @@ -179,23 +179,16 @@ class VectorIterator : public InternalIterator { std::vector values_; size_t current_; }; -extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf, - const std::string& fname); -extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf); - -extern SequentialFileReader* GetSequentialFileReader(SequentialFile* se, - const std::string& fname); - -class StringSink: public WritableFile { +class StringSink : public FSWritableFile { public: std::string contents_; - explicit StringSink(Slice* reader_contents = nullptr) : - WritableFile(), - contents_(""), - reader_contents_(reader_contents), - last_flush_(0) { + explicit StringSink(Slice* reader_contents = nullptr) + : FSWritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { if (reader_contents_ != nullptr) { *reader_contents_ = Slice(contents_.data(), 0); } @@ -203,12 +196,15 @@ class StringSink: public WritableFile { const std::string& contents() const { return contents_; } - virtual Status Truncate(uint64_t size) override { + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.resize(static_cast(size)); - return Status::OK(); + return IOStatus::OK(); } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { if (reader_contents_ != nullptr) { assert(reader_contents_->size() <= 
last_flush_); size_t offset = last_flush_ - reader_contents_->size(); @@ -218,12 +214,17 @@ class StringSink: public WritableFile { last_flush_ = contents_.size(); } - return Status::OK(); + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.append(slice.data(), slice.size()); - return Status::OK(); + return IOStatus::OK(); } void Drop(size_t bytes) { if (reader_contents_ != nullptr) { @@ -240,36 +241,44 @@ class StringSink: public WritableFile { }; // A wrapper around a StringSink to give it a RandomRWFile interface -class RandomRWStringSink : public RandomRWFile { +class RandomRWStringSink : public FSRandomRWFile { public: explicit RandomRWStringSink(StringSink* ss) : ss_(ss) {} - Status Write(uint64_t offset, const Slice& data) override { + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { if (offset + data.size() > ss_->contents_.size()) { ss_->contents_.resize(static_cast(offset) + data.size(), '\0'); } char* pos = const_cast(ss_->contents_.data() + offset); memcpy(pos, data.data(), data.size()); - return Status::OK(); + return IOStatus::OK(); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* /*scratch*/) const override { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* /*scratch*/, + IODebugContext* /*dbg*/) const override { *result = Slice(nullptr, 0); if (offset < ss_->contents_.size()) { size_t str_res_sz = std::min(static_cast(ss_->contents_.size() - offset), n); *result = Slice(ss_->contents_.data() + offset, str_res_sz); } - return Status::OK(); + return IOStatus::OK(); } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return Status::OK(); } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Close() override { return Status::OK(); } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } const std::string& contents() const { return ss_->contents(); } @@ -280,34 +289,42 @@ class RandomRWStringSink : public RandomRWFile { // Like StringSink, this writes into a string. Unlink StringSink, it // has some initial content and overwrites it, just like a recycled // log file. 
-class OverwritingStringSink : public WritableFile { +class OverwritingStringSink : public FSWritableFile { public: explicit OverwritingStringSink(Slice* reader_contents) - : WritableFile(), + : FSWritableFile(), contents_(""), reader_contents_(reader_contents), last_flush_(0) {} const std::string& contents() const { return contents_; } - virtual Status Truncate(uint64_t size) override { + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.resize(static_cast(size)); - return Status::OK(); + return IOStatus::OK(); } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { if (last_flush_ < contents_.size()) { assert(reader_contents_->size() >= contents_.size()); memcpy((char*)reader_contents_->data() + last_flush_, contents_.data() + last_flush_, contents_.size() - last_flush_); last_flush_ = contents_.size(); } - return Status::OK(); + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.append(slice.data(), slice.size()); - return Status::OK(); + return IOStatus::OK(); } void Drop(size_t bytes) { contents_.resize(contents_.size() - bytes); @@ -320,7 +337,7 @@ class OverwritingStringSink : public WritableFile { size_t last_flush_; }; -class StringSource: public RandomAccessFile { +class StringSource : public FSRandomAccessFile { public: explicit StringSource(const Slice& contents, uint64_t uniq_id = 0, bool mmap = false) @@ -333,11 +350,23 @@ class StringSource: public RandomAccessFile { uint64_t Size() const { return contents_.size(); } - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + // If we are using mmap_, it is equivalent to performing a prefetch + if (mmap_) { + return IOStatus::OK(); + } else { + return IOStatus::NotSupported("Prefetch not supported"); + } + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { total_reads_++; if (offset > contents_.size()) { - return Status::InvalidArgument("invalid Read offset"); + return IOStatus::InvalidArgument("invalid Read offset"); } if (offset + n > contents_.size()) { n = contents_.size() - static_cast(offset); @@ -348,10 +377,10 @@ class StringSource: public RandomAccessFile { } else { *result = Slice(&contents_[static_cast(offset)], n); } - return Status::OK(); + return IOStatus::OK(); } - virtual size_t GetUniqueId(char* id, size_t max_size) const override { + size_t GetUniqueId(char* id, size_t max_size) const override { if (max_size < 20) { return 0; } @@ -373,13 +402,6 @@ class StringSource: public RandomAccessFile { mutable int total_reads_; }; -inline StringSink* GetStringSinkFromLegacyWriter( - const WritableFileWriter* writer) { - LegacyWritableFileWrapper* file = - static_cast(writer->writable_file()); - return static_cast(file->target()); -} - 
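// Illustration (not introduced by this diff): the conversion applied above to
// StringSink, RandomRWStringSink, OverwritingStringSink and StringSource is
// mechanical -- derive from the FS* interface instead of the legacy Env file
// class, return IOStatus instead of Status, and accept the extra IOOptions /
// IODebugContext parameters. A minimal, hypothetical in-memory sink that only
// counts bytes would look roughly like this (sketch only, not part of the
// test utilities):
//
//   class ByteCountingSink : public FSWritableFile {
//    public:
//     using FSWritableFile::Append;
//     IOStatus Append(const Slice& data, const IOOptions& /*opts*/,
//                     IODebugContext* /*dbg*/) override {
//       bytes_written_ += data.size();
//       return IOStatus::OK();
//     }
//     IOStatus Close(const IOOptions& /*opts*/,
//                    IODebugContext* /*dbg*/) override {
//       return IOStatus::OK();
//     }
//     IOStatus Flush(const IOOptions& /*opts*/,
//                    IODebugContext* /*dbg*/) override {
//       return IOStatus::OK();
//     }
//     IOStatus Sync(const IOOptions& /*opts*/,
//                   IODebugContext* /*dbg*/) override {
//       return IOStatus::OK();
//     }
//     uint64_t bytes_written() const { return bytes_written_; }
//
//    private:
//     uint64_t bytes_written_ = 0;
//   };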
class NullLogger : public Logger { public: using Logger::Logv; @@ -394,6 +416,10 @@ extern std::string KeyStr(const std::string& user_key, const SequenceNumber& seq, const ValueType& t, bool corrupt = false); +extern std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); + class SleepingBackgroundTask { public: SleepingBackgroundTask() @@ -427,16 +453,8 @@ class SleepingBackgroundTask { // otherwise times out. // wait_time is in microseconds. // Returns true when times out, false otherwise. - bool TimedWaitUntilSleeping(uint64_t wait_time) { - auto abs_time = Env::Default()->NowMicros() + wait_time; - MutexLock l(&mutex_); - while (!sleeping_ || !should_sleep_) { - if (bg_cv_.TimedWait(abs_time)) { - return true; - } - } - return false; - } + bool TimedWaitUntilSleeping(uint64_t wait_time); + void WakeUp() { MutexLock l(&mutex_); should_sleep_ = false; @@ -450,16 +468,8 @@ class SleepingBackgroundTask { } // Similar to TimedWaitUntilSleeping. // Waits until the task is done. - bool TimedWaitUntilDone(uint64_t wait_time) { - auto abs_time = Env::Default()->NowMicros() + wait_time; - MutexLock l(&mutex_); - while (!done_with_sleep_) { - if (bg_cv_.TimedWait(abs_time)) { - return true; - } - } - return false; - } + bool TimedWaitUntilDone(uint64_t wait_time); + bool WokenUp() { MutexLock l(&mutex_); return should_sleep_ == false; @@ -522,176 +532,220 @@ inline std::string EncodeInt(uint64_t x) { return result; } - class SeqStringSource : public SequentialFile { +class SeqStringSource : public FSSequentialFile { + public: + SeqStringSource(const std::string& data, std::atomic* read_count) + : data_(data), offset_(0), read_count_(read_count) {} + ~SeqStringSource() override {} + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + std::string output; + if (offset_ < data_.size()) { + n = std::min(data_.size() - offset_, n); + memcpy(scratch, data_.data() + offset_, n); + offset_ += n; + *result = Slice(scratch, n); + } else { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + (*read_count_)++; + return IOStatus::OK(); + } + + IOStatus Skip(uint64_t n) override { + if (offset_ >= data_.size()) { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + // TODO(yhchiang): Currently doesn't handle the overflow case. + offset_ += static_cast(n); + return IOStatus::OK(); + } + + private: + std::string data_; + size_t offset_; + std::atomic* read_count_; +}; + +class StringFS : public FileSystemWrapper { + public: + class StringSink : public FSWritableFile { public: - SeqStringSource(const std::string& data, std::atomic* read_count) - : data_(data), offset_(0), read_count_(read_count) {} - ~SeqStringSource() override {} - Status Read(size_t n, Slice* result, char* scratch) override { - std::string output; - if (offset_ < data_.size()) { - n = std::min(data_.size() - offset_, n); - memcpy(scratch, data_.data() + offset_, n); - offset_ += n; - *result = Slice(scratch, n); - } else { - return Status::InvalidArgument( - "Attemp to read when it already reached eof."); - } - (*read_count_)++; - return Status::OK(); - } - Status Skip(uint64_t n) override { - if (offset_ >= data_.size()) { - return Status::InvalidArgument( - "Attemp to read when it already reached eof."); - } - // TODO(yhchiang): Currently doesn't handle the overflow case. 
- offset_ += static_cast(n); - return Status::OK(); + explicit StringSink(std::string* contents) + : FSWritableFile(), contents_(contents) {} + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->resize(static_cast(size)); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->append(slice.data(), slice.size()); + return IOStatus::OK(); } private: - std::string data_; - size_t offset_; - std::atomic* read_count_; + std::string* contents_; }; - class StringEnv : public EnvWrapper { - public: - class StringSink : public WritableFile { - public: - explicit StringSink(std::string* contents) - : WritableFile(), contents_(contents) {} - virtual Status Truncate(uint64_t size) override { - contents_->resize(static_cast(size)); - return Status::OK(); - } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } - - private: - std::string* contents_; - }; - - explicit StringEnv(Env* t) : EnvWrapper(t) {} - ~StringEnv() override {} - - const std::string& GetContent(const std::string& f) { return files_[f]; } - - const Status WriteToNewFile(const std::string& file_name, + explicit StringFS(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + ~StringFS() override {} + + const std::string& GetContent(const std::string& f) { return files_[f]; } + + const IOStatus WriteToNewFile(const std::string& file_name, const std::string& content) { - std::unique_ptr r; - auto s = NewWritableFile(file_name, &r, EnvOptions()); - if (s.ok()) { - s = r->Append(content); - } - if (s.ok()) { - s = r->Flush(); - } - if (s.ok()) { - s = r->Close(); - } - assert(!s.ok() || files_[file_name] == content); - return s; - } - - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist", f); - } - r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); - return Status::OK(); - } - Status NewRandomAccessFile(const std::string& /*f*/, - std::unique_ptr* /*r*/, - const EnvOptions& /*options*/) override { - return Status::NotSupported(); - } - Status NewWritableFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter != files_.end()) { - return Status::IOError("The specified file already exists", f); - } - r->reset(new StringSink(&files_[f])); - return Status::OK(); - } - virtual Status NewDirectory( - const std::string& /*name*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported(); - } - Status FileExists(const std::string& f) override { - if (files_.find(f) == files_.end()) { - return Status::NotFound(); - } - return 
Status::OK(); - } - Status GetChildren(const std::string& /*dir*/, - std::vector* /*r*/) override { - return Status::NotSupported(); - } - Status DeleteFile(const std::string& f) override { - files_.erase(f); - return Status::OK(); - } - Status CreateDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status CreateDirIfMissing(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status DeleteDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status GetFileSize(const std::string& f, uint64_t* s) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist:", f); - } - *s = iter->second.size(); - return Status::OK(); - } - - Status GetFileModificationTime(const std::string& /*fname*/, - uint64_t* /*file_mtime*/) override { - return Status::NotSupported(); - } - - Status RenameFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } - - Status LinkFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } - - Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { - return Status::NotSupported(); - } - - Status UnlockFile(FileLock* /*l*/) override { - return Status::NotSupported(); - } - - std::atomic num_seq_file_read_; + std::unique_ptr r; + FileOptions file_opts; + IOOptions io_opts; - protected: - std::unordered_map files_; - }; + auto s = NewWritableFile(file_name, file_opts, &r, nullptr); + if (s.ok()) { + s = r->Append(content, io_opts, nullptr); + } + if (s.ok()) { + s = r->Flush(io_opts, nullptr); + } + if (s.ok()) { + s = r->Close(io_opts, nullptr); + } + assert(!s.ok() || files_[file_name] == content); + return s; + } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, + const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist", f); + } + r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); + return IOStatus::OK(); + } + + IOStatus NewRandomAccessFile(const std::string& /*f*/, + const FileOptions& /*options*/, + std::unique_ptr* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus NewWritableFile(const std::string& f, const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter != files_.end()) { + return IOStatus::IOError("The specified file already exists", f); + } + r->reset(new StringSink(&files_[f])); + return IOStatus::OK(); + } + IOStatus NewDirectory(const std::string& /*name*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus FileExists(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (files_.find(f) == files_.end()) { + return IOStatus::NotFound(); + } + return IOStatus::OK(); + } + + IOStatus GetChildren(const std::string& /*dir*/, const IOOptions& /*options*/, + std::vector* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + files_.erase(f); + return 
IOStatus::OK(); + } + + IOStatus CreateDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus CreateDirIfMissing(const std::string& /*d*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return IOStatus::OK(); + } + + IOStatus GetFileModificationTime(const std::string& /*fname*/, + const IOOptions& /*options*/, + uint64_t* /*file_mtime*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus RenameFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LinkFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LockFile(const std::string& /*f*/, const IOOptions& /*options*/, + FileLock** /*l*/, IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus UnlockFile(FileLock* /*l*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + std::atomic num_seq_file_read_; + + protected: + std::unordered_map files_; +}; // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); @@ -792,6 +846,9 @@ std::string RandomName(Random* rnd, const size_t len); bool IsDirectIOSupported(Env* env, const std::string& dir); +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir); + // Return the number of lines where a given pattern was found in a file. size_t GetLinesCount(const std::string& fname, const std::string& pattern); @@ -800,8 +857,19 @@ size_t GetLinesCount(const std::string& fname, const std::string& pattern); // Tries to set TEST_TMPDIR to a directory supporting direct IO. void ResetTmpDirForDirectIO(); +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum = true); +Status TruncateFile(Env* env, const std::string& fname, uint64_t length); + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname); -void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt); +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname); +// Creates an Env from the system environment by looking at the system +// environment variables. 
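// The implementation (see testutil.cc in this diff) honors the TEST_ENV_URI
// and TEST_FS_URI environment variables and otherwise falls back to
// config_options.env. A typical call from a test, sketched here for
// illustration only (the surrounding setup is hypothetical), might be:
//
//   Env* env = Env::Default();
//   std::shared_ptr<Env> env_guard;
//   Status s = test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard);
//   // On success `env` is ready to use; `env_guard` owns it only when a
//   // custom Env/FileSystem was created from one of the URIs.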
+Status CreateEnvFromSystem(const ConfigOptions& options, Env** result, + std::shared_ptr* guard); } // namespace test } // namespace ROCKSDB_NAMESPACE diff --git a/test_util/transaction_test_util.cc b/test_util/transaction_test_util.cc index a9410f5fc47..28f16a5e71b 100644 --- a/test_util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -349,6 +349,7 @@ Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets, static_cast(key.size()), key.data(), int_value); total += int_value; } + iter->status().PermitUncheckedError(); delete iter; } diff --git a/third-party/folly/folly/synchronization/DistributedMutex-inl.h b/third-party/folly/folly/synchronization/DistributedMutex-inl.h index ca5650de696..6e250c94d25 100644 --- a/third-party/folly/folly/synchronization/DistributedMutex-inl.h +++ b/third-party/folly/folly/synchronization/DistributedMutex-inl.h @@ -1374,7 +1374,8 @@ inline std::uintptr_t tryWake( // we need release here because of the write to waker_ and also because we // are unlocking the mutex, the thread we do the handoff to here should // see the modified data - new (&waiter->metadata_) Metadata(waker, bit_cast(sleepers)); + new (&waiter->metadata_) + Metadata(waker, folly::bit_cast(sleepers)); waiter->futex_.store(kWake, std::memory_order_release); return 0; } diff --git a/third-party/gcc/ppc-asm.h b/third-party/gcc/ppc-asm.h new file mode 100644 index 00000000000..e0bce9c5aec --- /dev/null +++ b/third-party/gcc/ppc-asm.h @@ -0,0 +1,390 @@ +/* PowerPC asm definitions for GNU C. + +Copyright (C) 2002-2020 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. 
*/ + +/* Under winnt, 1) gas supports the following as names and 2) in particular + defining "toc" breaks the FUNC_START macro as ".toc" becomes ".2" */ + +#define r0 0 +#define sp 1 +#define toc 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4 +#define cr5 5 +#define cr6 6 +#define cr7 7 + +#define f0 0 +#define f1 1 +#define f2 2 +#define f3 3 +#define f4 4 +#define f5 5 +#define f6 6 +#define f7 7 +#define f8 8 +#define f9 9 +#define f10 10 +#define f11 11 +#define f12 12 +#define f13 13 +#define f14 14 +#define f15 15 +#define f16 16 +#define f17 17 +#define f18 18 +#define f19 19 +#define f20 20 +#define f21 21 +#define f22 22 +#define f23 23 +#define f24 24 +#define f25 25 +#define f26 26 +#define f27 27 +#define f28 28 +#define f29 29 +#define f30 30 +#define f31 31 + +#ifdef __VSX__ +#define f32 32 +#define f33 33 +#define f34 34 +#define f35 35 +#define f36 36 +#define f37 37 +#define f38 38 +#define f39 39 +#define f40 40 +#define f41 41 +#define f42 42 +#define f43 43 +#define f44 44 +#define f45 45 +#define f46 46 +#define f47 47 +#define f48 48 +#define f49 49 +#define f50 50 +#define f51 51 +#define f52 52 +#define f53 53 +#define f54 54 +#define f55 55 +#define f56 56 +#define f57 57 +#define f58 58 +#define f59 59 +#define f60 60 +#define f61 61 +#define f62 62 +#define f63 63 +#endif + +#ifdef __ALTIVEC__ +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#endif + +#ifdef __VSX__ +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#define vs14 14 +#define vs15 15 +#define vs16 16 +#define vs17 17 +#define vs18 18 +#define vs19 19 +#define vs20 20 +#define vs21 21 +#define vs22 22 +#define vs23 23 +#define vs24 24 +#define vs25 25 +#define vs26 26 +#define vs27 27 +#define vs28 28 +#define vs29 29 +#define vs30 30 +#define vs31 31 +#define vs32 32 +#define vs33 33 +#define vs34 34 +#define vs35 35 +#define vs36 36 +#define vs37 37 +#define vs38 38 +#define vs39 39 +#define vs40 40 +#define vs41 41 +#define vs42 42 +#define vs43 43 +#define vs44 44 +#define vs45 45 +#define vs46 46 +#define vs47 47 +#define vs48 48 +#define vs49 49 +#define vs50 50 +#define vs51 51 +#define vs52 52 +#define vs53 53 +#define vs54 54 +#define vs55 55 +#define vs56 56 +#define vs57 57 +#define vs58 58 +#define vs59 59 +#define vs60 60 +#define vs61 61 +#define vs62 62 +#define vs63 63 +#endif + +/* + * Macros to glue together two tokens. 
+ */ + +#ifdef __STDC__ +#define XGLUE(a,b) a##b +#else +#define XGLUE(a,b) a/**/b +#endif + +#define GLUE(a,b) XGLUE(a,b) + +/* + * Macros to begin and end a function written in assembler. If -mcall-aixdesc + * or -mcall-nt, create a function descriptor with the given name, and create + * the real function with one or two leading periods respectively. + */ + +#if defined(__powerpc64__) && _CALL_ELF == 2 + +/* Defining "toc" above breaks @toc in assembler code. */ +#undef toc + +#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name) +#ifdef __PCREL__ +#define JUMP_TARGET(name) GLUE(FUNC_NAME(name),@notoc) +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): \ + .localentry FUNC_NAME(name),1 +#else +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): \ +0: addis 2,12,(.TOC.-0b)@ha; \ + addi 2,2,(.TOC.-0b)@l; \ + .localentry FUNC_NAME(name),.-FUNC_NAME(name) +#endif /* !__PCREL__ */ + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden FUNC_NAME(name); + +#define FUNC_END(name) \ + .size FUNC_NAME(name),.-FUNC_NAME(name) + +#elif defined (__powerpc64__) + +#define FUNC_NAME(name) GLUE(.,name) +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .section ".opd","aw"; \ +name: \ + .quad GLUE(.,name); \ + .quad .TOC.@tocbase; \ + .quad 0; \ + .previous; \ + .type GLUE(.,name),@function; \ + .globl name; \ + .globl GLUE(.,name); \ +GLUE(.,name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden name; \ + .hidden GLUE(.,name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size GLUE(.,name),GLUE(.L,name)-GLUE(.,name) + +#elif defined(_CALL_AIXDESC) + +#ifdef _RELOCATABLE +#define DESC_SECTION ".got2" +#else +#define DESC_SECTION ".got1" +#endif + +#define FUNC_NAME(name) GLUE(.,name) +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .section DESC_SECTION,"aw"; \ +name: \ + .long GLUE(.,name); \ + .long _GLOBAL_OFFSET_TABLE_; \ + .long 0; \ + .previous; \ + .type GLUE(.,name),@function; \ + .globl name; \ + .globl GLUE(.,name); \ +GLUE(.,name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden name; \ + .hidden GLUE(.,name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size GLUE(.,name),GLUE(.L,name)-GLUE(.,name) + +#else + +#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name) +#if defined __PIC__ || defined __pic__ +#define JUMP_TARGET(name) FUNC_NAME(name@plt) +#else +#define JUMP_TARGET(name) FUNC_NAME(name) +#endif +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden FUNC_NAME(name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size FUNC_NAME(name),GLUE(.L,name)-FUNC_NAME(name) +#endif + +#ifdef IN_GCC +/* For HAVE_GAS_CFI_DIRECTIVE. 
*/ +#include "auto-host.h" + +#ifdef HAVE_GAS_CFI_DIRECTIVE +# define CFI_STARTPROC .cfi_startproc +# define CFI_ENDPROC .cfi_endproc +# define CFI_OFFSET(reg, off) .cfi_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_RESTORE(reg) .cfi_restore reg +#else +# define CFI_STARTPROC +# define CFI_ENDPROC +# define CFI_OFFSET(reg, off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_RESTORE(reg) +#endif +#endif + +#if defined __linux__ && !defined __powerpc64__ + .section .note.GNU-stack + .previous +#endif diff --git a/tools/backup_db.sh b/tools/backup_db.sh new file mode 100755 index 00000000000..aa82f1dbaed --- /dev/null +++ b/tools/backup_db.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# + +if [ "$#" -lt 2 ]; then + echo "usage: ${BASH_SOURCE[0]} " + exit 1 +fi + +db_dir="$1" +backup_dir="$2" + +echo "== Backing up DB $db_dir to $backup_dir" +./ldb backup --db="$db_dir" --backup_dir="$backup_dir" diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc index bffd480b435..29ec8cb91ba 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -19,6 +19,7 @@ #include #include "monitoring/histogram.h" +#include "rocksdb/system_clock.h" #include "util/gflags_compat.h" #include "util/string_util.h" @@ -1519,6 +1520,7 @@ Status BlockCacheTraceAnalyzer::RecordAccess( } Status BlockCacheTraceAnalyzer::Analyze() { + SystemClock* clock = env_->GetSystemClock().get(); std::unique_ptr reader; Status s = Status::OK(); if (is_human_readable_trace_file_) { @@ -1542,7 +1544,7 @@ Status BlockCacheTraceAnalyzer::Analyze() { return s; } } - uint64_t start = env_->NowMicros(); + uint64_t start = clock->NowMicros(); uint64_t time_interval = 0; while (s.ok()) { BlockCacheTraceRecord access; @@ -1568,7 +1570,7 @@ Status BlockCacheTraceAnalyzer::Analyze() { cache_simulator_->Access(access); } access_sequence_number_++; - uint64_t now = env_->NowMicros(); + uint64_t now = clock->NowMicros(); uint64_t duration = (now - start) / kMicrosInSecond; if (duration > 10 * time_interval) { uint64_t trace_duration = @@ -1582,7 +1584,7 @@ Status BlockCacheTraceAnalyzer::Analyze() { time_interval++; } } - uint64_t now = env_->NowMicros(); + uint64_t now = clock->NowMicros(); uint64_t duration = (now - start) / kMicrosInSecond; uint64_t trace_duration = trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/tools/block_cache_analyzer/block_cache_trace_analyzer.h index 74fc22b10fd..4436e0b77a1 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.h +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.h @@ -103,7 +103,8 @@ struct BlockAccessInfo { num_referenced_key_exist_in_block++; if (referenced_data_size > block_size && block_size != 0) { ParsedInternalKey internal_key; - Status s = ParseInternalKey(access.referenced_key, &internal_key); + Status s = ParseInternalKey(access.referenced_key, &internal_key, + false /* log_err_key */); // TODO assert(s.ok()); // TODO } } else { diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index c26795d60ad..91bd30652f7 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +++ 
b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -9,7 +9,7 @@ int main() { fprintf(stderr, "Please install gflags to run block_cache_trace_analyzer_test\n"); - return 1; + return 0; } #else @@ -44,7 +44,7 @@ const size_t kArgBufferSize = 100000; class BlockCacheTracerTest : public testing::Test { public: BlockCacheTracerTest() { - test_path_ = test::PerThreadDBPath("block_cache_tracer_test"); + test_path_ = test::PerThreadDBPath("block_cache_trace_analyzer_test"); env_ = ROCKSDB_NAMESPACE::Env::Default(); EXPECT_OK(env_->CreateDir(test_path_)); trace_file_path_ = test_path_ + "/block_cache_trace"; @@ -225,7 +225,9 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer)); - BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer)); + const auto& clock = env_->GetSystemClock(); + BlockCacheTraceWriter writer(clock.get(), trace_opt, + std::move(trace_writer)); ASSERT_OK(writer.WriteHeader()); WriteBlockAccess(&writer, 0, TraceType::kBlockTraceDataBlock, 50); ASSERT_OK(env_->FileExists(trace_file_path_)); @@ -610,9 +612,11 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { // kSSTStoringEvenKeys. TraceOptions trace_opt; std::unique_ptr trace_writer; + const auto& clock = env_->GetSystemClock(); ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer)); - BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer)); + BlockCacheTraceWriter writer(clock.get(), trace_opt, + std::move(trace_writer)); ASSERT_OK(writer.WriteHeader()); // Write blocks of different types. WriteBlockAccess(&writer, 0, TraceType::kBlockTraceUncompressionDictBlock, diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index e9424196d5c..5bba01daefa 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -1,24 +1,82 @@ #!/usr/bin/env bash # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # -# A shell script to load some pre generated data file to a DB using ldb tool -# ./ldb needs to be avaible to be executed. +# A shell script to build and run different versions of ldb to check for +# expected forward and backward compatibility with "current" version. The +# working copy must have no uncommitted changes. +# +# Usage: